From 2a52ca7c98960aafb0eca9ef96b2d0c932171357 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 18 Jun 2024 10:09:17 -1000
Subject: sched_ext: Add scx_simple and scx_example_qmap example schedulers

Add two simple example BPF schedulers - simple and qmap.

* simple: In terms of scheduling, it behaves identically to not having any
  operation implemented at all. The two operations it implements are only to
  improve visibility and exit handling. On certain homogeneous
  configurations, this actually can perform pretty well.

* qmap: A fixed five-level priority scheduler to demonstrate queueing PIDs
  on BPF maps for scheduling. While not very practical, this is useful as a
  simple example and will be used to demonstrate different features.

v7: - Compat helpers stripped out in preparation for upstreaming as the
      upstreamed patchset will be the baseline. Utility macros that can be
      used to implement compat features are kept.

    - Explicitly disable map autoattach on struct_ops to avoid trying to
      attach twice while maintaining compatibility with older libbpf.

v6: - Common header files reorganized and cleaned up. Compat helpers are
      added to demonstrate how schedulers can maintain backward
      compatibility with older kernels while making use of newly added
      features.

    - simple_select_cpu() added to keep track of the number of local
      dispatches. This is needed because the default ops.select_cpu()
      implementation is updated to dispatch directly and won't call
      ops.enqueue().

    - Updated to reflect the sched_ext API changes. Switching all tasks is
      the default behavior now and scx_qmap supports partial switching when
      `-p` is specified.

    - tools/sched_ext/Kconfig dropped. This will be included in the doc
      instead.

v5: - Improve Makefile. Build artifacts are now collected into a separate
      dir which can be changed. Install and help targets are added and clean
      actually cleans everything.

    - MEMBER_VPTR() reworked to ease access to structs. ARRAY_ELEM_PTR()
      and RESIZABLE_ARRAY() are added to support resizable arrays in .bss.

    - Add scx_common.h which provides common utilities to user code such as
      SCX_BUG[_ON]() and RESIZE_ARRAY().

    - Use SCX_BUG[_ON]() to simplify error handling.

v4: - Dropped _example prefix from scheduler names.

v3: - Rename scx_example_dummy to scx_example_simple and restructure a bit
      to ease later additions. Comment updates.

    - Added declarations for BPF inline iterators. In the future, hopefully,
      these will be consolidated into a generic BPF header so that they
      don't need to be replicated here.

v2: - Updated with the generic BPF cpumask helpers.
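To give a sense of the scaffolding the headers below provide, a do-nothing
scheduler reduces to roughly the following sketch ("minimal" is an
illustrative name, not part of this patch; only exit reporting is
implemented, mirroring what scx_simple's extra ops are there for):

	#include <scx/common.bpf.h>

	char _license[] SEC("license") = "GPL";

	UEI_DEFINE(uei);			/* exit info shared with userspace */

	void BPF_STRUCT_OPS(minimal_exit, struct scx_exit_info *ei)
	{
		UEI_RECORD(uei, ei);		/* record reason/msg for UEI_REPORT() */
	}

	SCX_OPS_DEFINE(minimal_ops,
		       .exit	= (void *)minimal_exit,
		       .name	= "minimal");

With no other operation implemented, every scheduling decision falls back to
the in-kernel defaults, which is also how scx_simple behaves; its remaining
ops exist only for statistics and clean exit reporting.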
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- tools/Makefile | 10 +- tools/sched_ext/.gitignore | 2 + tools/sched_ext/Makefile | 246 ++++++++++++++++ tools/sched_ext/include/bpf-compat/gnu/stubs.h | 11 + tools/sched_ext/include/scx/common.bpf.h | 379 +++++++++++++++++++++++++ tools/sched_ext/include/scx/common.h | 75 +++++ tools/sched_ext/include/scx/compat.bpf.h | 28 ++ tools/sched_ext/include/scx/compat.h | 153 ++++++++++ tools/sched_ext/include/scx/user_exit_info.h | 64 +++++ tools/sched_ext/scx_qmap.bpf.c | 264 +++++++++++++++++ tools/sched_ext/scx_qmap.c | 99 +++++++ tools/sched_ext/scx_simple.bpf.c | 63 ++++ tools/sched_ext/scx_simple.c | 99 +++++++ 13 files changed, 1492 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/.gitignore create mode 100644 tools/sched_ext/Makefile create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h create mode 100644 tools/sched_ext/include/scx/common.bpf.h create mode 100644 tools/sched_ext/include/scx/common.h create mode 100644 tools/sched_ext/include/scx/compat.bpf.h create mode 100644 tools/sched_ext/include/scx/compat.h create mode 100644 tools/sched_ext/include/scx/user_exit_info.h create mode 100644 tools/sched_ext/scx_qmap.bpf.c create mode 100644 tools/sched_ext/scx_qmap.c create mode 100644 tools/sched_ext/scx_simple.bpf.c create mode 100644 tools/sched_ext/scx_simple.c (limited to 'tools') diff --git a/tools/Makefile b/tools/Makefile index 276f5d0d53a4..278d24723b74 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -28,6 +28,7 @@ help: @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' + @echo ' sched_ext - sched_ext example schedulers' @echo ' bootconfig - boot config tool' @echo ' spi - spi tools' @echo ' tmon - thermal monitoring and tuning tool' @@ -91,6 +92,9 @@ perf: FORCE $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= +sched_ext: FORCE + $(call descend,sched_ext) + selftests: FORCE $(call descend,testing/$@) @@ -184,6 +188,9 @@ perf_clean: $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean +sched_ext_clean: + $(call descend,sched_ext,clean) + selftests_clean: $(call descend,testing/$(@:_clean=),clean) @@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ - intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean + intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ + sched_ext_clean .PHONY: FORCE diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore new file mode 100644 index 000000000000..d6264fe1c8cd --- /dev/null +++ b/tools/sched_ext/.gitignore @@ -0,0 +1,2 @@ +tools/ +build/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 index 000000000000..626782a21375 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+include ../build/Build.include +include ../scripts/Makefile.arch +include ../scripts/Makefile.include + +all: all_targets + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := $(CROSS_COMPILE)gcc +endif # LLVM + +CURDIR := $(abspath .) +TOOLSDIR := $(abspath ..) +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(abspath ../../include/generated) +GENHDR := $(GENDIR)/autoconf.h + +ifeq ($(O),) +OUTPUT_DIR := $(CURDIR)/build +else +OUTPUT_DIR := $(O)/build +endif # O +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BINDIR := $(OUTPUT_DIR)/bin +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(OBJ_DIR)/host +HOST_OUTPUT_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include +else +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) +HOST_INCLUDE_DIR := $(INCLUDE_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids +DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ + | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object 
$(<:.o=.linked1.o) $<
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o)
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o)
+	$(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o)
+	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@
+	$(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h)
+
+SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
+
+c-sched-targets = scx_simple scx_qmap
+
+$(addprefix $(BINDIR)/,$(c-sched-targets)): \
+	$(BINDIR)/%: \
+		$(filter-out %.bpf.c,%.c) \
+		$(INCLUDE_DIR)/%.bpf.skel.h \
+		$(SCX_COMMON_DEPS)
+	$(eval sched=$(notdir $@))
+	$(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o
+	$(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS)
+
+$(c-sched-targets): %: $(BINDIR)/%
+
+install: all
+	$(Q)mkdir -p $(DESTDIR)/usr/local/bin/
+	$(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/
+
+clean:
+	rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR)
+	rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h
+	rm -f $(c-sched-targets)
+
+help:
+	@echo 'Building targets'
+	@echo '================'
+	@echo ''
+	@echo '  all		- Compile all schedulers'
+	@echo ''
+	@echo 'Alternatively, you may compile individual schedulers:'
+	@echo ''
+	@printf '  %s\n' $(c-sched-targets)
+	@echo ''
+	@echo 'For any scheduler build target, you may specify an alternative'
+	@echo 'build output path with the O= environment variable. For example:'
+	@echo ''
+	@echo '  O=/tmp/sched_ext make all'
+	@echo ''
+	@echo 'will compile all schedulers, and emit the build artifacts to'
+	@echo '/tmp/sched_ext/build.'
+	@echo ''
+	@echo ''
+	@echo 'Installing targets'
+	@echo '=================='
+	@echo ''
+	@echo '  install	- Compile and install all schedulers to /usr/local/bin.'
+	@echo '		  You may specify the DESTDIR= environment variable'
+	@echo '		  to indicate a prefix for /usr/local/bin. For example:'
+	@echo ''
+	@echo '		  DESTDIR=/tmp/sched_ext make install'
+	@echo ''
+	@echo '		  will build the schedulers in CWD/build, and'
+	@echo '		  install the schedulers to /tmp/sched_ext/usr/local/bin.'
+	@echo ''
+	@echo ''
+	@echo 'Cleaning targets'
+	@echo '================'
+	@echo ''
+	@echo '  clean		- Remove all generated files'
+
+all_targets: $(c-sched-targets)
+
+.PHONY: all all_targets $(c-sched-targets) clean help
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h
new file mode 100644
index 000000000000..ad7d139ce907
--- /dev/null
+++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h
@@ -0,0 +1,11 @@
+/*
+ * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when
+ * compiling BPF files although its content doesn't play any role. The file in
+ * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is
+ * defined. When compiling a BPF source, __x86_64__ isn't set and thus
+ * stubs-32.h is selected. However, the file is not there if the system
+ * doesn't have the 32bit glibc devel package installed, leading to a build
+ * failure.
+ *
+ * The problem is worked around by making this file available in the include
+ * search paths before the system one when building BPF.
+ */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h new file mode 100644 index 000000000000..833fe1bdccf9 --- /dev/null +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -0,0 +1,379 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __SCX_COMMON_BPF_H +#define __SCX_COMMON_BPF_H + +#include "vmlinux.h" +#include +#include +#include +#include "user_exit_info.h" + +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. + */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch_cancel(void) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; +u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; +const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; +const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; +void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} + +/* + * Helper macro for initializing the fmt and variadic argument inputs to both + * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to + * refer to the initialized list of inputs to the bstr kfunc. + */ +#define scx_bpf_bstr_preamble(fmt, args...) \ + static char ___fmt[] = fmt; \ + /* \ + * Note that __param[] must have at least one \ + * element to keep the verifier happy. \ + */ \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + +/* + * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments + * instead of an array of u64. Using this macro will cause the scheduler to + * exit cleanly with the specified exit code being passed to user space. 
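+ *
+ * A hypothetical usage sketch (the condition, exit code and message here are
+ * illustrative, not from this patch):
+ *
+ *	if (nr_setup_failures)
+ *		scx_bpf_exit(1, "%u setup failures, giving up", nr_setup_failures);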
+ */ +#define scx_bpf_exit(code, fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Invoking this macro will cause the scheduler to + * exit in an erroneous state, with diagnostic information being passed to the + * user. + */ +#define scx_bpf_error(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized + * @elfsec: the data section of the BPF program in which to place the array + * @arr: the name of the array + * + * libbpf has an API for setting map value sizes. Since data sections (i.e. + * bss, data, rodata) themselves are maps, a data section can be resized. If + * a data section has an array as its last element, the BTF info for that + * array will be adjusted so that length of the array is extended to meet the + * new length of the data section. This macro annotates an array to have an + * element count of one with the assumption that this array can be resized + * within the userspace program. It also annotates the section specifier so + * this array exists in a custom sub data section which can be resized + * independently. + * + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an + * array declared with RESIZABLE_ARRAY(). + */ +#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and add that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. However, + * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller + * must check for %NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for %NULL and dereference + * immediately. + * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + * + * sizeof(@base) should encompass the memory area to be accessed and thus can't + * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of + * `MEMBER_VPTR(ptr, ->member)`. 
+ */
+#define MEMBER_VPTR(base, member) (typeof((base) member) *)		\
+({									\
+	u64 __base = (u64)&(base);					\
+	u64 __addr = (u64)&((base) member) - __base;			\
+	_Static_assert(sizeof(base) >= sizeof((base) member),		\
+		       "@base is smaller than @member, is @base a pointer?"); \
+	asm volatile (							\
+		"if %0 <= %[max] goto +2\n"				\
+		"%0 = 0\n"						\
+		"goto +1\n"						\
+		"%0 += %1\n"						\
+		: "+r"(__addr)						\
+		: "r"(__base),						\
+		  [max]"i"(sizeof(base) - sizeof((base) member)));	\
+	__addr;								\
+})
+
+/**
+ * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
+ * @arr: array to index into
+ * @i: array index
+ * @n: number of elements in array
+ *
+ * Similar to MEMBER_VPTR() but is intended for use with arrays where the
+ * element count needs to be explicit.
+ * It can be used in cases where a global array is defined with an initial
+ * size but is intended to be resized before loading the BPF program.
+ * Without this version of the macro, MEMBER_VPTR() will use the compile time
+ * size of the array to compute the max, which will result in rejection by
+ * the verifier.
+ */
+#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)			\
+({									\
+	u64 __base = (u64)arr;						\
+	u64 __addr = (u64)&(arr[i]) - __base;				\
+	asm volatile (							\
+		"if %0 <= %[max] goto +2\n"				\
+		"%0 = 0\n"						\
+		"goto +1\n"						\
+		"%0 += %1\n"						\
+		: "+r"(__addr)						\
+		: "r"(__base),						\
+		  [max]"r"(sizeof(arr[0]) * ((n) - 1)));		\
+	__addr;								\
+})
+
+
+/*
+ * BPF declarations and helpers
+ */
+
+/* list and rbtree */
+#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+
+void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
+void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
+
+#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
+#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)
+
+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
+struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+				      struct bpf_rb_node *node) __ksym;
+int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+			bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+			void *meta, __u64 off) __ksym;
+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)
+
+struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
+
+void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
+#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
+
+/* task */
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
+/* cgroup */
+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
+void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
+
+/* css iteration */
+struct bpf_iter_css;
+struct cgroup_subsys_state;
+extern int bpf_iter_css_new(struct bpf_iter_css *it,
+			    struct cgroup_subsys_state *start,
+			    unsigned int flags) __weak __ksym;
+extern struct cgroup_subsys_state *
+bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
+extern void
bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + + +/* + * Other helpers + */ + +/* useful compiler attributes */ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define __maybe_unused __attribute__((__unused__)) + +/* + * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They + * prevent compiler from caching, redoing or reordering reads or writes. 
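+ *
+ * An illustrative use (not part of this header): polling a flag that
+ * userspace may flip concurrently through the skeleton's .bss, e.g.
+ *
+ *	if (READ_ONCE(some_userspace_set_flag))
+ *		...;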
+ */ +typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; +typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; +typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; +typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; + +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; + case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; + case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; + case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; + default: + barrier(); + __builtin_memcpy((void *)res, (const void *)p, size); + barrier(); + } +} + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; + case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; + case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; + case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; + default: + barrier(); + __builtin_memcpy((void *)p, (const void *)res, size); + barrier(); + } +} + +#define READ_ONCE(x) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +/* + * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. + * @v: The value for which we're computing the base 2 logarithm. + */ +static inline u32 log2_u32(u32 v) +{ + u32 r; + u32 shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + return r; +} + +/* + * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value. + * @v: The value for which we're computing the base 2 logarithm. + */ +static inline u32 log2_u64(u64 v) +{ + u32 hi = v >> 32; + if (hi) + return log2_u32(hi) + 32 + 1; + else + return log2_u32(v) + 1; +} + +#include "compat.bpf.h" + +#endif /* __SCX_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h new file mode 100644 index 000000000000..5b0f90152152 --- /dev/null +++ b/tools/sched_ext/include/scx/common.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_H +#define __SCHED_EXT_COMMON_H + +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + +#include +#include +#include +#include +#include + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#define SCX_BUG(__fmt, ...) \ + do { \ + fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \ + if (errno) \ + fprintf(stderr, " (%s)\n", strerror(errno)); \ + else \ + fprintf(stderr, "\n"); \ + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } while (0) + +#define SCX_BUG_ON(__cond, __fmt, ...) 
\ + do { \ + if (__cond) \ + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ + } while (0) + +/** + * RESIZE_ARRAY - Convenience macro for resizing a BPF array + * @__skel: the skeleton containing the array + * @elfsec: the data section of the BPF program in which the array exists + * @arr: the name of the array + * @n: the desired array element count + * + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two + * operations. It resizes the map which corresponds to the custom data + * section that contains the target array. As a side effect, the BTF info for + * the array is adjusted so that the array length is sized to cover the new + * data section size. The second operation is reassigning the skeleton pointer + * for that custom data section so that it points to the newly memory mapped + * region. + */ +#define RESIZE_ARRAY(__skel, elfsec, arr, n) \ + do { \ + size_t __sz; \ + bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \ + sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \ + (__skel)->elfsec##_##arr = \ + bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \ + } while (0) + +#include "user_exit_info.h" +#include "compat.h" + +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h new file mode 100644 index 000000000000..3d2fe1208900 --- /dev/null +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#ifndef __SCX_COMPAT_BPF_H +#define __SCX_COMPAT_BPF_H + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + __type __ret = 0; \ + if (bpf_core_enum_value_exists(__type, __ent)) \ + __ret = __ent; \ + __ret; \ +}) + +/* + * Define sched_ext_ops. This may be expanded to define multiple variants for + * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). + */ +#define SCX_OPS_DEFINE(__name, ...) \ + SEC(".struct_ops.link") \ + struct sched_ext_ops __name = { \ + __VA_ARGS__, \ + }; + +#endif /* __SCX_COMPAT_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h new file mode 100644 index 000000000000..a7fdaf8a858e --- /dev/null +++ b/tools/sched_ext/include/scx/compat.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 Tejun Heo
+ * Copyright (c) 2024 David Vernet
+ */
+#ifndef __SCX_COMPAT_H
+#define __SCX_COMPAT_H
+
+#include <bpf/btf.h>
+
+struct btf *__COMPAT_vmlinux_btf __attribute__((weak));
+
+static inline void __COMPAT_load_vmlinux_btf(void)
+{
+	if (!__COMPAT_vmlinux_btf) {
+		__COMPAT_vmlinux_btf = btf__load_vmlinux_btf();
+		SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()");
+	}
+}
+
+static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v)
+{
+	const struct btf_type *t;
+	const char *n;
+	s32 tid;
+	int i;
+
+	__COMPAT_load_vmlinux_btf();
+
+	tid = btf__find_by_name(__COMPAT_vmlinux_btf, type);
+	if (tid < 0)
+		return false;
+
+	t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
+	SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);
+
+	if (btf_is_enum(t)) {
+		struct btf_enum *e = btf_enum(t);
+
+		for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
+			n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
+			SCX_BUG_ON(!n, "btf__name_by_offset()");
+			if (!strcmp(n, name)) {
+				*v = e[i].val;
+				return true;
+			}
+		}
+	} else if (btf_is_enum64(t)) {
+		struct btf_enum64 *e = btf_enum64(t);
+
+		for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
+			n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
+			SCX_BUG_ON(!n, "btf__name_by_offset()");
+			if (!strcmp(n, name)) {
+				*v = btf_enum64_value(&e[i]);
+				return true;
+			}
+		}
+	}
+
+	return false;
+}
+
+#define __COMPAT_ENUM_OR_ZERO(__type, __ent)				\
+({									\
+	u64 __val = 0;							\
+	__COMPAT_read_enum(__type, __ent, &__val);			\
+	__val;								\
+})
+
+static inline bool __COMPAT_has_ksym(const char *ksym)
+{
+	__COMPAT_load_vmlinux_btf();
+	return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0;
+}
+
+static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
+{
+	const struct btf_type *t;
+	const struct btf_member *m;
+	const char *n;
+	s32 tid;
+	int i;
+
+	__COMPAT_load_vmlinux_btf();
+	tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT);
+	if (tid < 0)
+		return false;
+
+	t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
+	SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);
+
+	m = btf_members(t);
+
+	for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
+		n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off);
+		SCX_BUG_ON(!n, "btf__name_by_offset()");
+		if (!strcmp(n, field))
+			return true;
+	}
+
+	return false;
+}
+
+#define SCX_OPS_SWITCH_PARTIAL						\
+	__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
+
+/*
+ * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
+ * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
+ * and attach it, backward compatibility is automatically maintained where
+ * reasonable.
+ */
+#define SCX_OPS_OPEN(__ops_name, __scx_name) ({				\
+	struct __scx_name *__skel;					\
+									\
+	__skel = __scx_name##__open();					\
+	SCX_BUG_ON(!__skel, "Could not open " #__scx_name);		\
+	__skel;								\
+})
+
+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({			\
+	SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \
+})
+
+/*
+ * New versions of bpftool now emit additional link placeholders for BPF maps,
+ * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps
+ * automatically, assuming libbpf is recent enough (v1.5+). Old libbpf will do
+ * nothing with those links and won't attempt to auto-attach maps.
+ *
+ * To maintain compatibility with older libbpf while avoiding trying to attach
+ * twice, disable the autoattach feature on newer libbpf.
+ */ +#if LIBBPF_MAJOR_VERSION > 1 || \ + (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5) +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \ + bpf_map__set_autoattach((__skel)->maps.__ops_name, false) +#else +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0) +#endif + +#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ + struct bpf_link *__link; \ + __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \ + SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ + __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ + SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ + __link; \ +}) + +#endif /* __SCX_COMPAT_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h new file mode 100644 index 000000000000..8c3b7fac4d05 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#ifdef __bpf__ + +#include "vmlinux.h" +#include + +#define UEI_DEFINE(__name) \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#else /* !__bpf__ */ + +#include +#include + +#define UEI_EXITED(__skel, __uei_name) ({ \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ +}) + +#define UEI_REPORT(__skel, __uei_name) ({ \ + struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + fprintf(stderr, "EXIT: %s", __uei->reason); \ + if (__uei->msg[0] != '\0') \ + fprintf(stderr, " (%s)", __uei->msg); \ + fputs("\n", stderr); \ +}) + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c new file mode 100644 index 000000000000..976a2693da71 --- /dev/null +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. + * + * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets + * assigned to one depending on its compound weight. Each CPU round robins + * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from + * queue0, 2 from queue1, 4 from queue2 and so on. + * + * This scheduler demonstrates: + * + * - BPF-side queueing using PIDs. + * - Sleepable per-task storage allocation using ops.prep_enable(). + * + * This scheduler is primarily for demonstration and testing of sched_ext + * features and unlikely to be useful for actual workloads. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo
+ * Copyright (c) 2022 David Vernet
+ */
+#include <scx/common.bpf.h>
+
+enum consts {
+	ONE_SEC_IN_NS		= 1000000000,
+	SHARED_DSQ		= 0,
+};
+
+char _license[] SEC("license") = "GPL";
+
+const volatile u64 slice_ns = SCX_SLICE_DFL;
+const volatile u32 dsp_batch;
+
+u32 test_error_cnt;
+
+UEI_DEFINE(uei);
+
+struct qmap {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, 4096);
+	__type(value, u32);
+} queue0 SEC(".maps"),
+  queue1 SEC(".maps"),
+  queue2 SEC(".maps"),
+  queue3 SEC(".maps"),
+  queue4 SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(max_entries, 5);
+	__type(key, int);
+	__array(values, struct qmap);
+} queue_arr SEC(".maps") = {
+	.values = {
+		[0] = &queue0,
+		[1] = &queue1,
+		[2] = &queue2,
+		[3] = &queue3,
+		[4] = &queue4,
+	},
+};
+
+/* Per-task scheduling context */
+struct task_ctx {
+	bool	force_local;	/* Dispatch directly to local_dsq */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+struct cpu_ctx {
+	u64	dsp_idx;	/* dispatch index */
+	u64	dsp_cnt;	/* remaining count */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, struct cpu_ctx);
+} cpu_ctx_stor SEC(".maps");
+
+/* Statistics */
+u64 nr_enqueued, nr_dispatched, nr_dequeued;
+
+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	struct task_ctx *tctx;
+	s32 cpu;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+	if (!tctx) {
+		scx_bpf_error("task_ctx lookup failed");
+		return -ESRCH;
+	}
+
+	if (p->nr_cpus_allowed == 1 ||
+	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+		tctx->force_local = true;
+		return prev_cpu;
+	}
+
+	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+	if (cpu >= 0)
+		return cpu;
+
+	return prev_cpu;
+}
+
+static int weight_to_idx(u32 weight)
+{
+	/* Coarsely map the compound weight to a FIFO. */
+	if (weight <= 25)
+		return 0;
+	else if (weight <= 50)
+		return 1;
+	else if (weight < 200)
+		return 2;
+	else if (weight < 400)
+		return 3;
+	else
+		return 4;
+}
+
+void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+	u32 pid = p->pid;
+	int idx = weight_to_idx(p->scx.weight);
+	void *ring;
+
+	if (test_error_cnt && !--test_error_cnt)
+		scx_bpf_error("test triggering error");
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+	if (!tctx) {
+		scx_bpf_error("task_ctx lookup failed");
+		return;
+	}
+
+	/* Is select_cpu() telling us to enqueue locally? */
+	if (tctx->force_local) {
+		tctx->force_local = false;
+		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
+		return;
+	}
+
+	ring = bpf_map_lookup_elem(&queue_arr, &idx);
+	if (!ring) {
+		scx_bpf_error("failed to find ring %d", idx);
+		return;
+	}
+
+	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
+	if (bpf_map_push_elem(ring, &pid, 0)) {
+		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
+		return;
+	}
+
+	__sync_fetch_and_add(&nr_enqueued, 1);
+}
+
+/*
+ * The BPF queue map doesn't support removal and sched_ext can handle spurious
+ * dispatches. qmap_dequeue() is only used to collect statistics.
+ */ +void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) +{ + __sync_fetch_and_add(&nr_dequeued, 1); +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +{ + struct task_struct *p; + struct cpu_ctx *cpuc; + u32 zero = 0, batch = dsp_batch ?: 1; + void *fifo; + s32 i, pid; + + if (scx_bpf_consume(SHARED_DSQ)) + return; + + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { + scx_bpf_error("failed to look up cpu_ctx"); + return; + } + + for (i = 0; i < 5; i++) { + /* Advance the dispatch cursor and pick the fifo. */ + if (!cpuc->dsp_cnt) { + cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; + cpuc->dsp_cnt = 1 << cpuc->dsp_idx; + } + + fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); + if (!fifo) { + scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); + return; + } + + /* Dispatch or advance. */ + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(fifo, &pid)) + break; + + p = bpf_task_from_pid(pid); + if (!p) + continue; + + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); + bpf_task_release(p); + batch--; + cpuc->dsp_cnt--; + if (!batch || !scx_bpf_dispatch_nr_slots()) { + scx_bpf_consume(SHARED_DSQ); + return; + } + if (!cpuc->dsp_cnt) + break; + } + + cpuc->dsp_cnt = 0; + } +} + +s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) +{ + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(qmap_ops, + .select_cpu = (void *)qmap_select_cpu, + .enqueue = (void *)qmap_enqueue, + .dequeue = (void *)qmap_dequeue, + .dispatch = (void *)qmap_dispatch, + .init_task = (void *)qmap_init_task, + .init = (void *)qmap_init, + .exit = (void *)qmap_exit, + .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c new file mode 100644 index 000000000000..7c84ade7ecfb --- /dev/null +++ b/tools/sched_ext/scx_qmap.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo
+ * Copyright (c) 2022 David Vernet
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_qmap.bpf.skel.h"
+
+const char help_fmt[] =
+"A simple five-level FIFO queue sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-s SLICE_US] [-e COUNT] [-b COUNT] [-p] [-v]\n"
+"\n"
+"  -s SLICE_US   Override slice duration\n"
+"  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
+"  -b COUNT      Dispatch up to COUNT tasks together\n"
+"  -p            Switch only tasks on SCHED_EXT policy instead of all\n"
+"  -v            Print libbpf debug messages\n"
+"  -h            Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int dummy)
+{
+	exit_req = 1;
+}
+
+int main(int argc, char **argv)
+{
+	struct scx_qmap *skel;
+	struct bpf_link *link;
+	int opt;
+
+	libbpf_set_print(libbpf_print_fn);
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+
+	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
+
+	while ((opt = getopt(argc, argv, "s:e:b:pvh")) != -1) {
+		switch (opt) {
+		case 's':
+			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
+			break;
+		case 'e':
+			skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+			break;
+		case 'b':
+			skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
+			break;
+		case 'p':
+			skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
+			break;
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	SCX_OPS_LOAD(skel, qmap_ops, scx_qmap);
+	link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
+
+	while (!exit_req && !UEI_EXITED(skel, uei)) {
+		long nr_enqueued = skel->bss->nr_enqueued;
+		long nr_dispatched = skel->bss->nr_dispatched;
+
+		printf("stats : enq=%lu dsp=%lu delta=%ld deq=%"PRIu64"\n",
+		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
+		       skel->bss->nr_dequeued);
+		fflush(stdout);
+		sleep(1);
+	}
+
+	bpf_link__destroy(link);
+	UEI_REPORT(skel, uei);
+	scx_qmap__destroy(skel);
+	return 0;
+}
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
new file mode 100644
index 000000000000..6bb13a3c801b
--- /dev/null
+++ b/tools/sched_ext/scx_simple.bpf.c
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A simple scheduler.
+ *
+ * A simple global FIFO scheduler. It also demonstrates the following niceties.
+ *
+ * - Statistics tracking how many tasks are queued to local and global dsq's.
+ * - Termination notification for userspace.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc(0); /* count local queueing */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + stat_inc(1); /* count global queueing */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(simple_ops, + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .exit = (void *)simple_exit, + .name = "simple"); diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c new file mode 100644 index 000000000000..789ac62fea8e --- /dev/null +++ b/tools/sched_ext/scx_simple.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include "scx_simple.bpf.skel.h" + +const char help_fmt[] = +"A simple sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-v]\n" +"\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void read_stats(struct scx_simple *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_simple *skel; + struct bpf_link *link; + __u32 opt; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + skel = SCX_OPS_OPEN(simple_ops, scx_simple); + + while ((opt = getopt(argc, argv, "vh")) != -1) { + switch (opt) { + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, simple_ops, scx_simple); + link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu global=%llu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_simple__destroy(skel); + return 0; +} -- cgit v1.2.3 From 
8a010b81b3a50b033fc3cddc613517abda586cbe Mon Sep 17 00:00:00 2001
From: David Vernet
Date: Tue, 18 Jun 2024 10:09:18 -1000
Subject: sched_ext: Implement runnable task stall watchdog

The most common and critical way that a BPF scheduler can misbehave is by
failing to run runnable tasks for too long. This patch implements a
watchdog.

* All tasks record when they become runnable.

* A watchdog work periodically scans all runnable tasks. If any task has
  stayed runnable for too long, the BPF scheduler is aborted.

* scheduler_tick() monitors whether the watchdog itself is stuck. If so,
  the BPF scheduler is aborted.

Because the watchdog only scans the tasks which are currently runnable and
usually very infrequently, the overhead should be negligible.

scx_qmap is updated so that it can be told to stall user and/or kernel
tasks.

A detected task stall looks like the following:

   sched_ext: BPF scheduler "qmap" errored, disabling
   sched_ext: runnable task stall (dbus-daemon[953] failed to run for 6.478s)
      scx_check_timeout_workfn+0x10e/0x1b0
      process_one_work+0x287/0x560
      worker_thread+0x234/0x420
      kthread+0xe9/0x100
      ret_from_fork+0x1f/0x30

A detected watchdog stall:

   sched_ext: BPF scheduler "qmap" errored, disabling
   sched_ext: runnable task stall (watchdog failed to check in for 5.001s)
      scheduler_tick+0x2eb/0x340
      update_process_times+0x7a/0x90
      tick_sched_timer+0xd8/0x130
      __hrtimer_run_queues+0x178/0x3b0
      hrtimer_interrupt+0xfc/0x390
      __sysvec_apic_timer_interrupt+0xb7/0x2b0
      sysvec_apic_timer_interrupt+0x90/0xb0
      asm_sysvec_apic_timer_interrupt+0x1b/0x20
      default_idle+0x14/0x20
      arch_cpu_idle+0xf/0x20
      default_idle_call+0x50/0x90
      do_idle+0xe8/0x240
      cpu_startup_entry+0x1d/0x20
      kernel_init+0x0/0x190
      start_kernel+0x0/0x392
      start_kernel+0x324/0x392
      x86_64_start_reservations+0x2a/0x2c
      x86_64_start_kernel+0x104/0x109
      secondary_startup_64_no_verify+0xce/0xdb

Note that this patch exposes scx_ops_error[_type]() in kernel/sched/ext.h
to inline scx_notify_sched_tick().

v4: - While disabling, cancel_delayed_work_sync(&scx_watchdog_work) was
      being called before forward progress was guaranteed and thus could
      lead to system lockup. Relocated.

    - While enabling, it was comparing msecs against jiffies without
      conversion leading to spurious load failures on lower HZ kernels.
      Fixed.

    - runnable list management is now used by core bypass logic and moved
      to the patch implementing sched_ext core.

v3: - bpf_scx_init_member() was incorrectly comparing ops->timeout_ms
      against SCX_WATCHDOG_MAX_TIMEOUT which is in jiffies without
      conversion leading to spurious load failures in lower HZ kernels.
      Fixed.

v2: - Julia Lawall noticed that the watchdog code was mixing msecs and
      jiffies. Fix by using jiffies for everything.
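As a sketch of how a scheduler opts into a deadline tighter than the 30s
default, only the new timeout_ms member needs to be set in its ops
(illustrative fragment; "myops", "mysched" and the callbacks are made-up
names, not part of this patch):

	SCX_OPS_DEFINE(myops,
		       .enqueue		= (void *)my_enqueue,
		       .dispatch	= (void *)my_dispatch,
		       /*
			* Abort the scheduler if a runnable task isn't run
			* within 4s. Values converting to more than
			* SCX_WATCHDOG_MAX_TIMEOUT (30s) in jiffies are
			* rejected with -E2BIG at load time.
			*/
		       .timeout_ms	= 4000U,
		       .name		= "mysched");

Leaving timeout_ms at 0 keeps the default, which is the maximum allowed
timeout of 30 seconds.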
Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Julia Lawall --- include/linux/sched/ext.h | 1 + init/init_task.c | 1 + kernel/sched/core.c | 1 + kernel/sched/ext.c | 130 +++++++++++++++++++++++++++++++++++++++-- kernel/sched/ext.h | 2 + tools/sched_ext/scx_qmap.bpf.c | 12 ++++ tools/sched_ext/scx_qmap.c | 12 +++- 7 files changed, 153 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index c1530a7992cc..96031252436f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -122,6 +122,7 @@ struct sched_ext_entity { atomic_long_t ops_state; struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; u64 ddsp_dsq_id; u64 ddsp_enq_flags; diff --git a/init/init_task.c b/init/init_task.c index c6804396fe12..8a44c932d10f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -106,6 +106,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .sticky_cpu = -1, .holding_cpu = -1, .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), + .runnable_at = INITIAL_JIFFIES, .ddsp_dsq_id = SCX_DSQ_INVALID, .slice = SCX_SLICE_DFL, }, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6042ce3bfee0..f4365becdc13 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5516,6 +5516,7 @@ void sched_tick(void) calc_global_load_tick(rq); sched_core_tick(rq); task_tick_mm_cid(rq, curr); + scx_tick(rq); rq_unlock(rq, &rf); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1f5d80df263a..3dc515b3351f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8,6 +8,7 @@ enum scx_consts { SCX_DSP_DFL_MAX_BATCH = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, @@ -24,6 +25,7 @@ enum scx_exit_kind { SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ }; /* @@ -319,6 +321,15 @@ struct sched_ext_ops { */ u64 flags; + /** + * timeout_ms - The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + /** * name - BPF scheduler's name * @@ -472,6 +483,23 @@ struct static_key_false scx_has_op[SCX_OPI_END] = static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; +/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). + */ +static unsigned long scx_watchdog_timeout; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is. 
+ */ +static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; + +static struct delayed_work scx_watchdog_work; + /* idle tracking */ #ifdef CONFIG_SMP #ifdef CONFIG_CPUMASK_OFFSTACK @@ -1170,6 +1198,11 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) { lockdep_assert_rq_held(rq); + if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { + p->scx.runnable_at = jiffies; + p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + } + /* * list_add_tail() must be used. scx_ops_bypass() depends on tasks being * appened to the runnable_list. @@ -1177,9 +1210,11 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } -static void clr_task_runnable(struct task_struct *p) +static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) { list_del_init(&p->scx.runnable_node); + if (reset_runnable_at) + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; } static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) @@ -1217,7 +1252,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) { unsigned long opss; - clr_task_runnable(p); + /* dequeue is always temporary, don't reset runnable_at */ + clr_task_runnable(p, false); /* acquire ensures that we see the preceding updates on QUEUED */ opss = atomic_long_read_acquire(&p->scx.ops_state); @@ -1826,7 +1862,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) p->se.exec_start = rq_clock_task(rq); - clr_task_runnable(p); + clr_task_runnable(p, true); } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -2176,9 +2212,71 @@ static void reset_idle_masks(void) {} #endif /* CONFIG_SMP */ -static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct task_struct *p; + struct rq_flags rf; + bool timed_out = false; + + rq_lock_irqsave(rq, &rf); + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, + last_runnable + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, + dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } + rq_unlock_irqrestore(rq, &rf); + + return timed_out; +} + +static void scx_watchdog_workfn(struct work_struct *work) +{ + int cpu; + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + + for_each_online_cpu(cpu) { + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + scx_watchdog_timeout / 2); +} + +void scx_tick(struct rq *rq) { + unsigned long last_check; + + if (!scx_enabled()) + return; + + last_check = READ_ONCE(scx_watchdog_timestamp); + if (unlikely(time_after(jiffies, + last_check + READ_ONCE(scx_watchdog_timeout)))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } + update_other_load_avgs(rq); +} + +static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +{ update_curr_scx(rq); /* @@ -2248,6 +2346,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool scx_set_task_state(p, SCX_TASK_INIT); + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; return 0; } @@ 
-2326,6 +2425,7 @@ void init_scx_entity(struct sched_ext_entity *scx) scx->sticky_cpu = -1; scx->holding_cpu = -1; INIT_LIST_HEAD(&scx->runnable_node); + scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; scx->slice = SCX_SLICE_DFL; } @@ -2783,6 +2883,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) return "runtime error"; case SCX_EXIT_ERROR_BPF: return "scx_bpf_error"; + case SCX_EXIT_ERROR_STALL: + return "runnable task stall"; default: return ""; } @@ -2904,6 +3006,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) if (scx_ops.exit) SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + cancel_delayed_work_sync(&scx_watchdog_work); + /* * Delete the kobject from the hierarchy eagerly in addition to just * dropping a reference. Otherwise, if the object is deleted @@ -3026,6 +3130,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) { struct scx_task_iter sti; struct task_struct *p; + unsigned long timeout; int i, ret; mutex_lock(&scx_ops_enable_mutex); @@ -3103,6 +3208,16 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) goto err_disable; } + if (ops->timeout_ms) + timeout = msecs_to_jiffies(ops->timeout_ms); + else + timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + WRITE_ONCE(scx_watchdog_timeout, timeout); + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + scx_watchdog_timeout / 2); + /* * Lock out forks before opening the floodgate so that they don't wander * into the operations prematurely. @@ -3413,6 +3528,12 @@ static int bpf_scx_init_member(const struct btf_type *t, if (ret == 0) return -EINVAL; return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (msecs_to_jiffies(*(u32 *)(udata + moff)) > + SCX_WATCHDOG_MAX_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; } return 0; @@ -3569,6 +3690,7 @@ void __init init_sched_ext_class(void) } register_sysrq_key('S', &sysrq_sched_ext_reset_op); + INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); } diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 9c5a2d928281..56fcdb0b2c05 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -29,6 +29,7 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } +void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); @@ -66,6 +67,7 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #define scx_enabled() false #define scx_switched_all() false +static inline void scx_tick(struct rq *rq) {} static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 976a2693da71..8beae08dfdc7 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -29,6 +29,8 @@ enum consts { char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 stall_user_nth; +const volatile u32 stall_kernel_nth; const volatile u32 dsp_batch; u32 test_error_cnt; @@ -129,11 +131,20 @@ static int weight_to_idx(u32 weight) void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) { + static u32 user_cnt, kernel_cnt; struct task_ctx *tctx; u32 pid = p->pid; int idx = 
weight_to_idx(p->scx.weight); void *ring; + if (p->flags & PF_KTHREAD) { + if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) + return; + } else { + if (stall_user_nth && !(++user_cnt % stall_user_nth)) + return; + } + if (test_error_cnt && !--test_error_cnt) scx_bpf_error("test triggering error"); @@ -261,4 +272,5 @@ SCX_OPS_DEFINE(qmap_ops, .init_task = (void *)qmap_init_task, .init = (void *)qmap_init, .exit = (void *)qmap_exit, + .timeout_ms = 5000U, .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 7c84ade7ecfb..6e9e9726cd62 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -19,10 +19,12 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-b COUNT] [-p] [-v]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -t COUNT Stall every COUNT'th user thread\n" +" -T COUNT Stall every COUNT'th kernel thread\n" " -b COUNT Dispatch up to COUNT tasks together\n" " -p Switch only tasks on SCHED_EXT policy instead of all\n" " -v Print libbpf debug messages\n" @@ -55,7 +57,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:b:pvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:b:pvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -63,6 +65,12 @@ case 'e': skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); break; + case 't': + skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); + break; + case 'T': + skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); + break; case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; -- cgit v1.2.3 From 7bb6f0810ecfb73a9d7a2ca56fb001e0201a6758 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT BPF schedulers might not want to schedule certain tasks - e.g. kernel threads. This patch adds p->scx.disallow which can be set by BPF schedulers in such cases. The field can be changed anytime and setting it in ops.init_task() guarantees that the task can never be scheduled by sched_ext. scx_qmap is updated with the -d option to disallow a specific PID: # echo $$ 1092 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 7 ext.enabled : 0 Run "scx_qmap -p -d 1092" in another terminal. # cat /sys/kernel/sched_ext/nr_rejected 1 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 setparam failed for 1092 (Permission denied) - v4: Refreshed on top of tip:sched/core. - v3: Update description to reflect /sys/kernel/sched_ext interface change. - v2: Use atomic_long_t instead of atomic64_t for scx_kick_cpus_pnt_seqs to accommodate 32bit archs.
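[Illustration, not part of the series: from userspace, the rejection surfaces as an ordinary sched_setscheduler(2) failure. A minimal sketch of what the set-scx helper used in the transcript above presumably does; the SCHED_EXT policy number 7 is taken from the "policy : 7" output shown there and is an assumption here, not an ABI statement.]

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

#define SCHED_EXT 7	/* assumption: policy number from the transcript above */

/* Try to move @pid into SCHED_EXT. Once the BPF scheduler has set
 * p->scx.disallow, this fails with EACCES as demonstrated above. */
static int set_scx(int pid)
{
	struct sched_param param = { .sched_priority = 0 };

	if (sched_setscheduler(pid, SCHED_EXT, &param) < 0) {
		fprintf(stderr, "setparam failed for %d (%s)\n",
			pid, strerror(errno));
		return -1;
	}
	return 0;
}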
Signed-off-by: Tejun Heo Suggested-by: Barret Rhoden Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 12 ++++++++++ kernel/sched/ext.c | 50 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched/ext.h | 2 ++ kernel/sched/syscalls.c | 4 ++++ tools/sched_ext/scx_qmap.bpf.c | 4 ++++ tools/sched_ext/scx_qmap.c | 11 ++++++++-- 6 files changed, 81 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 96031252436f..ea7c501ac819 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -137,6 +137,18 @@ struct sched_ext_entity { */ u64 slice; + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * If set from ops.init_task() and the task's policy is already + * %SCHED_EXT, which can happen while the BPF scheduler is being loaded + * or by inheriting the parent's policy during fork, the task's policy is + * rejected and forcefully reverted to %SCHED_NORMAL. The number of + * such events is reported through /sys/kernel/sched_ext::nr_rejected. + */ + bool disallow; /* reject switching into SCX */ + /* cold fields */ /* must be the last field, see init_scx_entity() */ struct list_head tasks_node; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3dc515b3351f..8ff30b80e862 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -483,6 +483,8 @@ struct static_key_false scx_has_op[SCX_OPI_END] = static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; +static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); + /* * The maximum amount of time in jiffies that a task may be runnable without * being scheduled on a CPU. If this timeout is exceeded, it will trigger @@ -2332,6 +2334,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool { int ret; + p->scx.disallow = false; + if (SCX_HAS_OP(init_task)) { struct scx_init_task_args args = { .fork = fork, @@ -2346,6 +2350,27 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool scx_set_task_state(p, SCX_TASK_INIT); + if (p->scx.disallow) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're either in fork or load path and @p->policy will be + * applied right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.init_task() sets @p->disallow, @p can + * never be in SCX.
+ */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic_long_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; return 0; } @@ -2549,6 +2574,18 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} static void switched_to_scx(struct rq *rq, struct task_struct *p) {} +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + /* * Omitted operations: * @@ -2703,9 +2740,17 @@ static ssize_t scx_attr_switch_all_show(struct kobject *kobj, } SCX_ATTR(switch_all); +static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); +} +SCX_ATTR(nr_rejected); + static struct attribute *scx_global_attrs[] = { &scx_attr_state.attr, &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, NULL, }; @@ -3178,6 +3223,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) atomic_set(&scx_exit_kind, SCX_EXIT_NONE); scx_warned_zero_slice = false; + atomic_long_set(&scx_nr_rejected, 0); + /* * Keep CPUs stable during enable so that the BPF scheduler can track * online CPUs by watching ->on/offline_cpu() after ->init(). @@ -3476,6 +3523,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, if (off >= offsetof(struct task_struct, scx.slice) && off + size <= offsetofend(struct task_struct, scx.slice)) return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; } return -EACCES; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 56fcdb0b2c05..33a9f7fe5832 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -35,6 +35,7 @@ void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); +int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(struct task_struct *p); void init_sched_ext_class(void); @@ -72,6 +73,7 @@ static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} +static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } static inline void init_sched_ext_class(void) {} diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 18d44d180db1..4fa59c9f69ac 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -714,6 +714,10 @@ recheck: goto unlock; } + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + /* * If not changing anything there's no need to proceed further, * but store a possible modification of reset_on_fork. 
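[Illustration, not part of the series: the scx_qmap diff that follows sets p->scx.disallow from ops.init_task() based on a configurable TGID. For the more typical use case named in the commit message - keeping kernel threads out - a hedged sketch of such a callback might look like this; the name nokthread_init_task is hypothetical and the snippet assumes the common.bpf.h helpers introduced earlier in the series.]

s32 BPF_STRUCT_OPS(nokthread_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	/*
	 * Mark kernel threads as off-limits before they can ever be
	 * scheduled by SCX; everything else is left alone. Setting
	 * ->disallow this early is what makes the guarantee race-free.
	 */
	if (p->flags & PF_KTHREAD)
		p->scx.disallow = true;
	return 0;
}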
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 8beae08dfdc7..5ff217c4bfa0 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -32,6 +32,7 @@ const volatile u64 slice_ns = SCX_SLICE_DFL; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_batch; +const volatile s32 disallow_tgid; u32 test_error_cnt; @@ -243,6 +244,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, struct scx_init_task_args *args) { + if (p->tgid == disallow_tgid) + p->scx.disallow = true; + /* * @p is new. Let's ensure that its task_ctx is available. We can sleep * in this function and the following will automatically use GFP_KERNEL. diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 6e9e9726cd62..a2614994cfaa 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -19,13 +19,15 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT] [-p] [-v]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n" +" [-d PID] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" " -b COUNT Dispatch up to COUNT tasks together\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -p Switch only tasks on SCHED_EXT policy instead of all\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -55,7 +57,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:b:pvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -74,6 +76,11 @@ case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; + case 'd': + skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) + skel->rodata->disallow_tgid = getpid(); + break; case 'p': skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; break; -- cgit v1.2.3 From 07814a9439a3b03d79a1001614b5bc1cab69bcec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: sched_ext: Print debug dump after an error exit If a BPF scheduler triggers an error, the scheduler is aborted and the system is reverted to the built-in scheduler. In the process, a lot of information which may be useful for figuring out what happened can be lost. This patch adds a debug dump which captures information which may be useful for debugging, including runqueue and runnable thread states at the time of failure.
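[Illustration, not part of the series: the dump can also carry scheduler-specific state. The patch adds ops.dump(), ops.dump_cpu() and ops.dump_task() callbacks, detailed in the diff below, which emit lines through scx_bpf_dump(); the "QMAP ..." lines in the output that follows come from exactly such callbacks. A minimal hedged sketch, where nr_enqueued is a hypothetical counter:]

u64 nr_enqueued;	/* hypothetical counter, bumped from ops.enqueue() */

void BPF_STRUCT_OPS(sketch_dump, struct scx_dump_ctx *dctx)
{
	/* shows up as one extra line in the DEBUG DUMP section */
	scx_bpf_dump("SKETCH: nr_enqueued=%llu", nr_enqueued);
}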
The following shows a debug dump after triggering the watchdog: root@test ~# os/work/tools/sched_ext/build/bin/scx_qmap -t 100 stats : enq=1 dsp=0 delta=1 deq=0 stats : enq=90 dsp=90 delta=0 deq=0 stats : enq=156 dsp=156 delta=0 deq=0 stats : enq=218 dsp=218 delta=0 deq=0 stats : enq=255 dsp=255 delta=0 deq=0 stats : enq=271 dsp=271 delta=0 deq=0 stats : enq=284 dsp=284 delta=0 deq=0 stats : enq=293 dsp=293 delta=0 deq=0 DEBUG DUMP ================================================================================ kworker/u32:12[320] triggered exit kind 1026: runnable task stall (stress[1530] failed to run for 6.841s) Backtrace: scx_watchdog_workfn+0x136/0x1c0 process_scheduled_works+0x2b5/0x600 worker_thread+0x269/0x360 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 QMAP FIFO[0]: QMAP FIFO[1]: QMAP FIFO[2]: 1436 QMAP FIFO[3]: QMAP FIFO[4]: CPU states ---------- CPU 0 : nr_run=1 ops_qseq=244 curr=swapper/0[0] class=idle_sched_class QMAP: dsp_idx=1 dsp_cnt=0 R stress[1530] -6841ms scx_state/flags=3/0x1 ops_state/qseq=2/20 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=0 asm_sysvec_apic_timer_interrupt+0x16/0x20 CPU 2 : nr_run=2 ops_qseq=142 curr=swapper/2[0] class=idle_sched_class QMAP: dsp_idx=1 dsp_cnt=0 R sshd[1703] -5905ms scx_state/flags=3/0x9 ops_state/qseq=2/88 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=1 __x64_sys_ppoll+0xf6/0x120 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e R fish[1539] -4141ms scx_state/flags=3/0x9 ops_state/qseq=2/124 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=1 futex_wait+0x60/0xe0 do_futex+0x109/0x180 __x64_sys_futex+0x117/0x190 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e CPU 3 : nr_run=2 ops_qseq=162 curr=kworker/u32:12[320] class=ext_sched_class QMAP: dsp_idx=1 dsp_cnt=0 *R kworker/u32:12[320] +0ms scx_state/flags=3/0xd ops_state/qseq=0/0 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=0 scx_dump_state+0x613/0x6f0 scx_ops_error_irq_workfn+0x1f/0x40 irq_work_run_list+0x82/0xd0 irq_work_run+0x14/0x30 __sysvec_irq_work+0x40/0x140 sysvec_irq_work+0x60/0x70 asm_sysvec_irq_work+0x16/0x20 scx_watchdog_workfn+0x15f/0x1c0 process_scheduled_works+0x2b5/0x600 worker_thread+0x269/0x360 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 R kworker/3:2[1436] +0ms scx_state/flags=3/0x9 ops_state/qseq=2/160 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=08 QMAP: force_local=0 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 CPU 7 : nr_run=0 ops_qseq=76 curr=swapper/7[0] class=idle_sched_class ================================================================================ EXIT: runnable task stall (stress[1530] failed to run for 6.841s) It shows that CPU 3 was running the watchdog when it triggered the error condition and the scx_qmap thread has been queued on CPU 0 for over 5 seconds but failed to run. It also prints out scx_qmap specific information - e.g. which tasks are queued on each FIFO and so on using the dump_*() ops. This dump has proved pretty useful for developing and debugging BPF schedulers. Debug dump is generated automatically when the BPF scheduler exits due to an error. The debug buffer used in such cases is determined by sched_ext_ops.exit_dump_len and defaults to 32k. If the debug dump overruns the available buffer, the output is truncated and marked accordingly. Debug dump output can also be read through the sched_ext_dump tracepoint. 
When read through the tracepoint, there is no length limit. SysRq-D can be used to trigger a debug dump at any time while a BPF scheduler is loaded. This is non-destructive - the scheduler keeps running afterwards. The output can be read through the sched_ext_dump tracepoint. v2: - The size of exit debug dump buffer can now be customized using sched_ext_ops.exit_dump_len. - sched_ext_ops.dump*() added to enable dumping of BPF scheduler specific information. - Tracepoint output and SysRq-D triggering added. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- include/trace/events/sched_ext.h | 32 ++ kernel/sched/ext.c | 421 ++++++++++++++++++++++++++- tools/sched_ext/include/scx/common.bpf.h | 12 + tools/sched_ext/include/scx/compat.h | 9 +- tools/sched_ext/include/scx/user_exit_info.h | 19 ++ tools/sched_ext/scx_qmap.bpf.c | 54 ++++ tools/sched_ext/scx_qmap.c | 14 +- tools/sched_ext/scx_simple.c | 2 +- 8 files changed, 555 insertions(+), 8 deletions(-) create mode 100644 include/trace/events/sched_ext.h (limited to 'tools') diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h new file mode 100644 index 000000000000..fe19da7315a9 --- /dev/null +++ b/include/trace/events/sched_ext.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched_ext + +#if !defined(_TRACE_SCHED_EXT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SCHED_EXT_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(sched_ext_dump, + + TP_PROTO(const char *line), + + TP_ARGS(line), + + TP_STRUCT__entry( + __string(line, line) + ), + + TP_fast_assign( + __assign_str(line); + ), + + TP_printk("%s", + __get_str(line) + ) +); + +#endif /* _TRACE_SCHED_EXT_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6f4de29d7372..66bb9cf075f0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -12,6 +12,7 @@ enum scx_consts { SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, }; enum scx_exit_kind { @@ -48,6 +49,9 @@ struct scx_exit_info { /* informational message */ char *msg; + + /* debug dump */ + char *dump; }; /* sched_ext_ops.flags */ @@ -105,6 +109,17 @@ struct scx_exit_task_args { bool cancelled; }; +/* + * Informational context provided to dump operations. + */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -296,6 +311,36 @@ struct sched_ext_ops { */ void (*disable)(struct task_struct *p); + /** + * dump - Dump BPF scheduler state on error + * @ctx: debug dump context + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. + */ + void (*dump)(struct scx_dump_ctx *ctx); + + /** + * dump_cpu - Dump BPF scheduler state for a CPU on error + * @ctx: debug dump context + * @cpu: CPU to generate debug dump for + * @idle: @cpu is currently idle without any runnable tasks + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @cpu. If @idle is %true and this operation doesn't produce any + * output, @cpu is skipped for dump. + */ + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); + + /** + * dump_task - Dump BPF scheduler state for a runnable task on error + * @ctx: debug dump context + * @p: runnable task to generate debug dump for + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @p.
+ */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + /* * All online ops must come before ops.init(). */ @@ -330,6 +375,12 @@ struct sched_ext_ops { */ u32 timeout_ms; + /** + * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default + * value of 32768 is used. + */ + u32 exit_dump_len; + /** * name - BPF scheduler's name * @@ -567,10 +618,27 @@ struct scx_bstr_buf { static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); static struct scx_bstr_buf scx_exit_bstr_buf; +/* ops debug dump */ +struct scx_dump_data { + s32 cpu; + bool first; + s32 cursor; + struct seq_buf *s; + const char *prefix; + struct scx_bstr_buf buf; +}; + +struct scx_dump_data scx_dump_data = { + .cpu = -1, +}; + /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; static struct kobject *scx_root_kobj; +#define CREATE_TRACE_POINTS +#include + static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, s64 exit_code, const char *fmt, ...); @@ -2897,12 +2965,13 @@ static void scx_ops_bypass(bool bypass) static void free_exit_info(struct scx_exit_info *ei) { + kfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); } -static struct scx_exit_info *alloc_exit_info(void) +static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) { struct scx_exit_info *ei; @@ -2912,8 +2981,9 @@ static struct scx_exit_info *alloc_exit_info(void) ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); - if (!ei->bt || !ei->msg) { + if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); return NULL; } @@ -3125,8 +3195,274 @@ static void scx_ops_disable(enum scx_exit_kind kind) schedule_scx_ops_disable_work(); } +static void dump_newline(struct seq_buf *s) +{ + trace_sched_ext_dump(""); + + /* @s may be zero sized and seq_buf triggers WARN if so */ + if (s->size) + seq_buf_putc(s, '\n'); +} + +static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) +{ + va_list args; + +#ifdef CONFIG_TRACEPOINTS + if (trace_sched_ext_dump_enabled()) { + /* protected by scx_dump_state()::dump_lock */ + static char line_buf[SCX_EXIT_MSG_LEN]; + + va_start(args, fmt); + vscnprintf(line_buf, sizeof(line_buf), fmt, args); + va_end(args); + + trace_sched_ext_dump(line_buf); + } +#endif + /* @s may be zero sized and seq_buf triggers WARN if so */ + if (s->size) { + va_start(args, fmt); + seq_buf_vprintf(s, fmt, args); + va_end(args); + + seq_buf_putc(s, '\n'); + } +} + +static void dump_stack_trace(struct seq_buf *s, const char *prefix, + const unsigned long *bt, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + dump_line(s, "%s%pS", prefix, (void *)bt[i]); +} + +static void ops_dump_init(struct seq_buf *s, const char *prefix) +{ + struct scx_dump_data *dd = &scx_dump_data; + + lockdep_assert_irqs_disabled(); + + dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ + dd->first = true; + dd->cursor = 0; + dd->s = s; + dd->prefix = prefix; +} + +static void ops_dump_flush(void) +{ + struct scx_dump_data *dd = &scx_dump_data; + char *line = dd->buf.line; + + if (!dd->cursor) + return; + + /* + * There's something to flush and this is the first line. Insert a blank + * line to distinguish ops dump. + */ + if (dd->first) { + dump_newline(dd->s); + dd->first = false; + } + + /* + * There may be multiple lines in $line. Scan and emit each line + * separately. 
+ */ + while (true) { + char *end = line; + char c; + + while (*end != '\n' && *end != '\0') + end++; + + /* + * If $line overflowed, it may not have newline at the end. + * Always emit with a newline. + */ + c = *end; + *end = '\0'; + dump_line(dd->s, "%s%s", dd->prefix, line); + if (c == '\0') + break; + + /* move to the next line */ + end++; + if (*end == '\0') + break; + line = end; + } + + dd->cursor = 0; +} + +static void ops_dump_exit(void) +{ + ops_dump_flush(); + scx_dump_data.cpu = -1; +} + +static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, + struct task_struct *p, char marker) +{ + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); + unsigned int bt_len; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", + (unsigned long long)p->scx.dsq->id); + + dump_newline(s); + dump_line(s, " %c%c %s[%d] %+ldms", + marker, task_state_to_char(p), p->comm, p->pid, + jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); + dump_line(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu", + scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, + ops_state & SCX_OPSS_STATE_MASK, + ops_state >> SCX_OPSS_QSEQ_SHIFT); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + + if (SCX_HAS_OP(dump_task)) { + ops_dump_init(s, " "); + SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); + ops_dump_exit(); + } + + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); + if (bt_len) { + dump_newline(s); + dump_stack_trace(s, " ", bt, bt_len); + } +} + +static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) +{ + static DEFINE_SPINLOCK(dump_lock); + static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + struct scx_dump_ctx dctx = { + .kind = ei->kind, + .exit_code = ei->exit_code, + .reason = ei->reason, + .at_ns = ktime_get_ns(), + .at_jiffies = jiffies, + }; + struct seq_buf s; + unsigned long flags; + char *buf; + int cpu; + + spin_lock_irqsave(&dump_lock, flags); + + seq_buf_init(&s, ei->dump, dump_len); + + if (ei->kind == SCX_EXIT_NONE) { + dump_line(&s, "Debug dump triggered by %s", ei->reason); + } else { + dump_line(&s, "%s[%d] triggered exit kind %d:", + current->comm, current->pid, ei->kind); + dump_line(&s, " %s (%s)", ei->reason, ei->msg); + dump_newline(&s); + dump_line(&s, "Backtrace:"); + dump_stack_trace(&s, " ", ei->bt, ei->bt_len); + } + + if (SCX_HAS_OP(dump)) { + ops_dump_init(&s, ""); + SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); + ops_dump_exit(); + } + + dump_newline(&s); + dump_line(&s, "CPU states"); + dump_line(&s, "----------"); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + struct seq_buf ns; + size_t avail, used; + bool idle; + + rq_lock(rq, &rf); + + idle = list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class; + + if (idle && !SCX_HAS_OP(dump_cpu)) + goto next; + + /* + * We don't yet know whether ops.dump_cpu() will produce output + * and we may want to skip the default CPU dump if it doesn't. + * Use a nested seq_buf to generate the standard dump so that we + * can decide whether to commit later. 
+ */ + avail = seq_buf_get_buf(&s, &buf); + seq_buf_init(&ns, buf, avail); + + dump_newline(&ns); + dump_line(&ns, "CPU %-4d: nr_run=%u ops_qseq=%lu", + cpu, rq->scx.nr_running, rq->scx.ops_qseq); + dump_line(&ns, " curr=%s[%d] class=%ps", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + + used = seq_buf_used(&ns); + if (SCX_HAS_OP(dump_cpu)) { + ops_dump_init(&ns, " "); + SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); + ops_dump_exit(); + } + + /* + * If idle && nothing generated by ops.dump_cpu(), there's + * nothing interesting. Skip. + */ + if (idle && used == seq_buf_used(&ns)) + goto next; + + /* + * $s may already have overflowed when $ns was created. If so, + * calling commit on it will trigger BUG. + */ + if (avail) { + seq_buf_commit(&s, seq_buf_used(&ns)); + if (seq_buf_has_overflowed(&ns)) + seq_buf_set_overflow(&s); + } + + if (rq->curr->sched_class == &ext_sched_class) + scx_dump_task(&s, &dctx, rq->curr, '*'); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + scx_dump_task(&s, &dctx, p, ' '); + next: + rq_unlock(rq, &rf); + } + + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) + memcpy(ei->dump + dump_len - sizeof(trunc_marker), + trunc_marker, sizeof(trunc_marker)); + + spin_unlock_irqrestore(&dump_lock, flags); +} + static void scx_ops_error_irq_workfn(struct irq_work *irq_work) { + struct scx_exit_info *ei = scx_exit_info; + + if (ei->kind >= SCX_EXIT_ERROR) + scx_dump_state(ei, scx_ops.exit_dump_len); + schedule_scx_ops_disable_work(); } @@ -3152,6 +3488,13 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); va_end(args); + /* + * Set ei->kind and ->reason for scx_dump_state(). They'll be set again + * in scx_ops_disable_workfn(). 
+ */ + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + irq_work_queue(&scx_ops_error_irq_work); } @@ -3213,7 +3556,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret < 0) goto err; - scx_exit_info = alloc_exit_info(); + scx_exit_info = alloc_exit_info(ops->exit_dump_len); if (!scx_exit_info) { ret = -ENOMEM; goto err_del; @@ -3592,6 +3935,10 @@ static int bpf_scx_init_member(const struct btf_type *t, return -E2BIG; ops->timeout_ms = *(u32 *)(udata + moff); return 1; + case offsetof(struct sched_ext_ops, exit_dump_len): + ops->exit_dump_len = + *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; + return 1; } return 0; @@ -3723,6 +4070,21 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { .enable_mask = SYSRQ_ENABLE_RTNICE, }; +static void sysrq_handle_sched_ext_dump(u8 key) +{ + struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; + + if (scx_enabled()) + scx_dump_state(&ei, 0); +} + +static const struct sysrq_key_op sysrq_sched_ext_dump_op = { + .handler = sysrq_handle_sched_ext_dump, + .help_msg = "dump-sched-ext(D)", + .action_msg = "Trigger sched_ext debug dump", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + /** * print_scx_info - print out sched_ext scheduler state * @log_lvl: the log level to use when printing @@ -3793,6 +4155,7 @@ void __init init_sched_ext_class(void) } register_sysrq_key('S', &sysrq_sched_ext_reset_op); + register_sysrq_key('D', &sysrq_sched_ext_dump_op); INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); } @@ -4218,6 +4581,57 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } +/** + * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler + * @fmt: format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and + * dump_task() to generate extra debug dump specific to the BPF scheduler. + * + * The extra dump may be multiple lines. A single line may be split over + * multiple calls. The last line is automatically terminated. + */ +__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, + u32 data__sz) +{ + struct scx_dump_data *dd = &scx_dump_data; + struct scx_bstr_buf *buf = &dd->buf; + s32 ret; + + if (raw_smp_processor_id() != dd->cpu) { + scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + return; + } + + /* append the formatted string to the line buf */ + ret = __bstr_format(buf->data, buf->line + dd->cursor, + sizeof(buf->line) - dd->cursor, fmt, data, data__sz); + if (ret < 0) { + dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", + dd->prefix, fmt, data, data__sz, ret); + return; + } + + dd->cursor += ret; + dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); + + if (!dd->cursor) + return; + + /* + * If the line buf overflowed or ends in a newline, flush it into the + * dump. This is to allow the caller to generate a single line over + * multiple calls. As ops_dump_flush() can also handle multiple lines in + * the line buf, the only case which can lead to an unexpected + * truncation is when the caller keeps generating newlines in the middle + * instead of the end consecutively. Don't do that. 
+ */ + if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') + ops_dump_flush(); +} + /** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * @@ -4426,6 +4840,7 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 833fe1bdccf9..3ea5cdf58bc7 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -38,6 +38,7 @@ s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; +void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; @@ -97,6 +98,17 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} ___scx_bpf_bstr_format_checker(fmt, ##args); \ }) +/* + * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments + * instead of an array of u64. To be used from ops.dump() and friends. + */ +#define scx_bpf_dump(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ BPF_PROG(name, ##args) diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index a7fdaf8a858e..c58024c980c8 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -111,16 +111,23 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load * and attach it, backward compatibility is automatically maintained where * reasonable. + * + * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is + * the current minimum required kernel version. 
*/ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ struct __scx_name *__skel; \ \ + SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \ + "sched_ext_ops.dump() missing, kernel too old?"); \ + \ __skel = __scx_name##__open(); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ __skel; \ }) -#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({ \ +#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ + UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 8c3b7fac4d05..c2ef85c645e1 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -13,6 +13,7 @@ enum uei_sizes { UEI_REASON_LEN = 128, UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, }; struct user_exit_info { @@ -28,6 +29,8 @@ struct user_exit_info { #include #define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ struct user_exit_info __name SEC(".data") #define UEI_RECORD(__uei_name, __ei) ({ \ @@ -35,6 +38,8 @@ struct user_exit_info { sizeof(__uei_name.reason), (__ei)->reason); \ bpf_probe_read_kernel_str(__uei_name.msg, \ sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ if (bpf_core_field_exists((__ei)->exit_code)) \ __uei_name.exit_code = (__ei)->exit_code; \ /* use __sync to force memory barrier */ \ @@ -47,6 +52,13 @@ struct user_exit_info { #include #include +/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ +#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ + u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ + (__skel)->rodata->__uei_name##_dump_len = __len; \ + RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ +}) + #define UEI_EXITED(__skel, __uei_name) ({ \ /* use __sync to force memory barrier */ \ __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ @@ -54,6 +66,13 @@ struct user_exit_info { #define UEI_REPORT(__skel, __uei_name) ({ \ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ + if (__uei_dump[0] != '\0') { \ + fputs("\nDEBUG DUMP\n", stderr); \ + fputs("================================================================================\n\n", stderr); \ + fputs(__uei_dump, stderr); \ + fputs("\n================================================================================\n\n", stderr); \ + } \ fprintf(stderr, "EXIT: %s", __uei->reason); \ if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 5ff217c4bfa0..5b3da28bf042 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -33,6 +33,7 @@ const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_batch; const volatile s32 disallow_tgid; +const volatile bool suppress_dump; u32 test_error_cnt; @@ -258,6 +259,56 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, return -ENOMEM; } +void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) +{ + s32 i, pid; + + if (suppress_dump) + return; + + bpf_for(i, 0, 5) { + void *fifo; + + if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) + return; + + scx_bpf_dump("QMAP 
FIFO[%d]:", i); + bpf_repeat(4096) { + if (bpf_map_pop_elem(fifo, &pid)) + break; + scx_bpf_dump(" %d", pid); + } + scx_bpf_dump("\n"); + } +} + +void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) +{ + u32 zero = 0; + struct cpu_ctx *cpuc; + + if (suppress_dump || idle) + return; + if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) + return; + + scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu", + cpuc->dsp_idx, cpuc->dsp_cnt); +} + +void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) +{ + struct task_ctx *taskc; + + if (suppress_dump) + return; + if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) + return; + + scx_bpf_dump("QMAP: force_local=%d", + taskc->force_local); +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { return scx_bpf_create_dsq(SHARED_DSQ, -1); @@ -274,6 +325,9 @@ SCX_OPS_DEFINE(qmap_ops, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, .init_task = (void *)qmap_init_task, + .dump = (void *)qmap_dump, + .dump_cpu = (void *)qmap_dump_cpu, + .dump_task = (void *)qmap_dump_task, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index a2614994cfaa..a1123a17581b 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n" -" [-d PID] [-p] [-v]\n" +" [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,6 +28,8 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -b COUNT Dispatch up to COUNT tasks together\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -D LEN Set scx_exit_info.dump buffer length\n" +" -S Suppress qmap-specific debug dump\n" " -p Switch only tasks on SCHED_EXT policy instead of all\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -59,7 +61,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -81,6 +83,12 @@ if (skel->rodata->disallow_tgid < 0) skel->rodata->disallow_tgid = getpid(); break; + case 'D': + skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); + break; + case 'S': + skel->rodata->suppress_dump = true; + break; case 'p': skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; break; @@ -93,7 +101,7 @@ } } - SCX_OPS_LOAD(skel, qmap_ops, scx_qmap); + SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); while (!exit_req && !UEI_EXITED(skel, uei)) { diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 789ac62fea8e..7f500d1d56ac 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -80,7 +80,7 @@ } } - SCX_OPS_LOAD(skel, simple_ops, scx_simple); + SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); while (!exit_req && !UEI_EXITED(skel, uei)) { -- cgit v1.2.3 From 1c3ae1cb2f2cf245d529fe363ce420ec8028a547 Mon
Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: tools/sched_ext: Add scx_show_state.py There are states which are interesting but don't quite fit the interface exposed under /sys/kernel/sched_ext. Add tools/scx_show_state.py to show them. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- tools/sched_ext/scx_show_state.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 tools/sched_ext/scx_show_state.py (limited to 'tools') diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py new file mode 100644 index 000000000000..d457d2a74e1e --- /dev/null +++ b/tools/sched_ext/scx_show_state.py @@ -0,0 +1,39 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Tejun Heo +# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current sched_ext state. +For more info on drgn, visit https://github.com/osandov/drgn. +""" + +import drgn +import sys + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def read_int(name): + return int(prog[name].value_()) + +def read_atomic(name): + return prog[name].counter.value_() + +def read_static_key(name): + return prog[name].key.enabled.counter.value_() + +def ops_state_str(state): + return prog['scx_ops_enable_state_str'][state].string_().decode() + +ops = prog['scx_ops'] +enable_state = read_atomic("scx_ops_enable_state_var") + +print(f'ops : {ops.name.string_().decode()}') +print(f'enabled : {read_static_key("__scx_ops_enabled")}') +print(f'switching_all : {read_int("scx_switching_all")}') +print(f'switched_all : {read_static_key("__scx_switched_all")}') +print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') -- cgit v1.2.3 From 81aae789181b5850d77dfdf74d4b85c63f0705e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Implement scx_bpf_kick_cpu() and task preemption support It's often useful to wake up and/or trigger reschedule on other CPUs. This patch adds scx_bpf_kick_cpu() kfunc helper that BPF scheduler can call to kick the target CPU into the scheduling path. As a sched_ext task relinquishes its CPU only after its slice is depleted, this patch also adds SCX_KICK_PREEMPT and SCX_ENQ_PREEMPT which clears the slice of the target CPU's current task to guarantee that sched_ext's scheduling path runs on the CPU. If SCX_KICK_IDLE is specified, the target CPU is kicked iff the CPU is idle to guarantee that the target CPU will go through at least one full sched_ext scheduling cycle after the kicking. This can be used to wake up idle CPUs without incurring unnecessary overhead if it isn't currently idle. As a demonstration of how backward compatibility can be supported using BPF CO-RE, tools/sched_ext/include/scx/compat.bpf.h is added. It provides __COMPAT_scx_bpf_kick_cpu_IDLE() which uses SCX_KICK_IDLE if available or becomes a regular kicking otherwise. This allows schedulers to use the new SCX_KICK_IDLE while maintaining support for older kernels. The plan is to temporarily use compat helpers to ease API updates and drop them after a few kernel releases. v5: - SCX_KICK_IDLE added. Note that this also adds a compat mechanism for schedulers so that they can support kernels without SCX_KICK_IDLE. This is useful as a demonstration of how new feature flags can be added in a backward compatible way. 
- kick_cpus_irq_workfn() reimplemented so that it touches the pending cpumasks only as necessary to reduce kicking overhead on machines with a lot of CPUs. - tools/sched_ext/include/scx/compat.bpf.h added. v4: - Move example scheduler to its own patch. v3: - Make scx_example_central switch all tasks by default. - Convert to BPF inline iterators. v2: - Julia Lawall reported that scx_example_central can overflow the dispatch buffer and malfunction. As scheduling for other CPUs can't be handled by the automatic retry mechanism, fix by implementing an explicit overflow and retry handling. - Updated to use generic BPF cpumask helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 4 + kernel/sched/ext.c | 225 +++++++++++++++++++++++++++++-- kernel/sched/sched.h | 10 ++ tools/sched_ext/include/scx/common.bpf.h | 1 + 4 files changed, 227 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 85fb5dc725ef..3b2809b980ac 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -134,6 +134,10 @@ struct sched_ext_entity { * scx_bpf_dispatch() but can also be modified directly by the BPF * scheduler. Automatically decreased by SCX as the task executes. On * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. */ u64 slice; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 66bb9cf075f0..213793d086d7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -412,6 +412,14 @@ enum scx_enq_flags { /* high 32bits are SCX specific */ + /* + * Set the following to trigger preemption when calling + * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + /* * The task being enqueued is the only task available for the cpu. By * default, ext core keeps executing such tasks but when @@ -441,6 +449,24 @@ enum scx_pick_idle_cpu_flags { SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ }; +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. 
+ */ + SCX_KICK_PREEMPT = 1LLU << 1, +}; + enum scx_ops_enable_state { SCX_OPS_PREPPING, SCX_OPS_ENABLING, @@ -1019,7 +1045,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } - if (enq_flags & SCX_ENQ_HEAD) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) list_add(&p->scx.dsq_node, &dsq->list); else list_add_tail(&p->scx.dsq_node, &dsq->list); @@ -1045,8 +1071,16 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, if (is_local) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } - if (sched_class_above(&ext_sched_class, rq->curr->sched_class)) + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) resched_curr(rq); } else { raw_spin_unlock(&dsq->lock); @@ -1872,8 +1906,10 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool has_tasks = false; lockdep_assert_rq_held(rq); + rq->scx.flags |= SCX_RQ_BALANCING; if (prev_on_scx) { WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); @@ -1890,19 +1926,19 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && !scx_ops_bypassing()) { prev->scx.flags |= SCX_TASK_BAL_KEEP; - return 1; + goto has_tasks; } } /* if there already are tasks to run, nothing to do */ if (rq->scx.local_dsq.nr) - return 1; + goto has_tasks; if (consume_dispatch_q(rq, rf, &scx_dsq_global)) - return 1; + goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) - return 0; + goto out; dspc->rq = rq; dspc->rf = rf; @@ -1923,12 +1959,18 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, flush_dispatch_buf(rq, rf); if (rq->scx.local_dsq.nr) - return 1; + goto has_tasks; if (consume_dispatch_q(rq, rf, &scx_dsq_global)) - return 1; + goto has_tasks; } while (dspc->nr_tasks); - return 0; + goto out; + +has_tasks: + has_tasks = true; +out: + rq->scx.flags &= ~SCX_RQ_BALANCING; + return has_tasks; } static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) @@ -2666,7 +2708,8 @@ int scx_check_setscheduler(struct task_struct *p, int policy) * Omitted operations: * * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task - * isn't tied to the CPU at that point. + * isn't tied to the CPU at that point. Preemption is implemented by resetting + * the victim task's slice to 0 and triggering reschedule on the target CPU. * * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. * @@ -2902,6 +2945,9 @@ bool task_should_scx(struct task_struct *p) * of the queue. * * d. pick_next_task() suppresses zero slice warning. + * + * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. 
*/ static void scx_ops_bypass(bool bypass) { @@ -3410,11 +3456,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_init(&ns, buf, avail); dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u ops_qseq=%lu", - cpu, rq->scx.nr_running, rq->scx.ops_qseq); + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.ops_qseq); dump_line(&ns, " curr=%s[%d] class=%ps", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + dump_line(&ns, " cpus_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) + dump_line(&ns, " idle_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + dump_line(&ns, " cpus_to_preempt: %*pb", + cpumask_pr_args(rq->scx.cpus_to_preempt)); used = seq_buf_used(&ns); if (SCX_HAS_OP(dump_cpu)) { @@ -4085,6 +4141,82 @@ static const struct sysrq_key_op sysrq_sched_ext_dump_op = { .enable_mask = SYSRQ_ENABLE_RTNICE, }; +static bool can_skip_idle_kick(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + /* + * We can skip idle kicking if @rq is going to go through at least one + * full SCX scheduling cycle before going idle. Just checking whether + * curr is not idle is insufficient because we could be racing + * balance_one() trying to pull the next task from a remote rq, which + * may fail, and @rq may become idle afterwards. + * + * The race window is small and we don't and can't guarantee that @rq is + * only kicked while idle anyway. Skip only when sure. + */ + return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING); +} + +static void kick_one_cpu(s32 cpu, struct rq *this_rq) +{ + struct rq *rq = cpu_rq(cpu); + struct scx_rq *this_scx = &this_rq->scx; + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + /* + * During CPU hotplug, a CPU may depend on kicking itself to make + * forward progress. Allow kicking self regardless of online state. + */ + if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { + if (rq->curr->sched_class == &ext_sched_class) + rq->curr->scx.slice = 0; + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + } + + resched_curr(rq); + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + } + + raw_spin_rq_unlock_irqrestore(rq, flags); +} + +static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + if (!can_skip_idle_kick(rq) && + (cpu_online(cpu) || cpu == cpu_of(this_rq))) + resched_curr(rq); + + raw_spin_rq_unlock_irqrestore(rq, flags); +} + +static void kick_cpus_irq_workfn(struct irq_work *irq_work) +{ + struct rq *this_rq = this_rq(); + struct scx_rq *this_scx = &this_rq->scx; + s32 cpu; + + for_each_cpu(cpu, this_scx->cpus_to_kick) { + kick_one_cpu(cpu, this_rq); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } + + for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { + kick_one_cpu_if_idle(cpu, this_rq); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } +} + /** * print_scx_info - print out sched_ext scheduler state * @log_lvl: the log level to use when printing @@ -4139,7 +4271,7 @@ void __init init_sched_ext_class(void) * definitions so that BPF scheduler implementations can use them * through the generated vmlinux.h. 
*/ - WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP); + WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); @@ -4152,6 +4284,11 @@ void __init init_sched_ext_class(void) init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.runnable_list); + + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); } register_sysrq_key('S', &sysrq_sched_ext_reset_op); @@ -4438,6 +4575,67 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { __bpf_kfunc_start_defs(); +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct rq *this_rq; + unsigned long irq_flags; + + if (!ops_cpu_valid(cpu, NULL)) + return; + + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. + */ + if (scx_ops_bypassing()) + return; + + local_irq_save(irq_flags); + + this_rq = this_rq(); + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting + * rq locks. We can probably be smarter and avoid bouncing if called + * from ops which don't hold a rq lock. 
+ */ + if (flags & SCX_KICK_IDLE) { + struct rq *target_rq = cpu_rq(cpu); + + if (unlikely(flags & SCX_KICK_PREEMPT)) + scx_ops_error("PREEMPT cannot be used with SCX_KICK_IDLE"); + + if (raw_spin_rq_trylock(target_rq)) { + if (can_skip_idle_kick(target_rq)) { + raw_spin_rq_unlock(target_rq); + goto out; + } + raw_spin_rq_unlock(target_rq); + } + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); + } else { + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); + + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); + } + + irq_work_queue(&this_rq->scx.kick_cpus_irq_work); +out: + local_irq_restore(irq_flags); +} + /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ @@ -4836,6 +5034,7 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2960e153c3a7..d9054eb4ba82 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -724,12 +724,22 @@ struct cfs_rq { }; #ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_BALANCING = 1 << 1, +}; + struct scx_rq { struct scx_dispatch_q local_dsq; struct list_head runnable_list; /* runnable tasks on this rq */ unsigned long ops_qseq; u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; + u32 flags; + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_kick_if_idle; + cpumask_var_t cpus_to_preempt; + struct irq_work kick_cpus_irq_work; }; #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 3ea5cdf58bc7..421118bc56ff 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -34,6 +34,7 @@ void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flag u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; -- cgit v1.2.3 From 037df2a314a0f2b1c6778602909dc57e5f1790f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Add a central scheduler which makes all scheduling decisions on one CPU This patch adds a new example scheduler, scx_central, which demonstrates central scheduling where one CPU is responsible for making all scheduling decisions in the system using scx_bpf_kick_cpu(). The central CPU makes scheduling decisions for all CPUs in the system, queues tasks on the appropriate local dsq's and preempts the worker CPUs. The worker CPUs in turn preempt the central CPU when it needs tasks to run. Currently, every CPU depends on its own tick to expire the current task. A follow-up patch implementing tickless support for sched_ext will allow the worker CPUs to go full tickless so that they can run completely undisturbed. v3: - Kumar fixed a bug where the dispatch path could overflow the dispatch buffer if too many are dispatched to the fallback DSQ. 
- Use the new SCX_KICK_IDLE to wake up non-central CPUs. - Dropped '-p' option. v2: - Use RESIZABLE_ARRAY() instead of fixed MAX_CPUS and use SCX_BUG[_ON]() to simplify error handling. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Kumar Kartikeya Dwivedi Cc: Julia Lawall --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_central.bpf.c | 214 ++++++++++++++++++++++++++++++++++++++ tools/sched_ext/scx_central.c | 105 +++++++++++++++++++ 3 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_central.bpf.c create mode 100644 tools/sched_ext/scx_central.c (limited to 'tools') diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 626782a21375..bf7e108f5ae1 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -176,7 +176,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_qmap +c-sched-targets = scx_simple scx_qmap scx_central $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c new file mode 100644 index 000000000000..428b2262faa3 --- /dev/null +++ b/tools/sched_ext/scx_central.bpf.c @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A central FIFO sched_ext scheduler which demonstrates the following: + * + * a. Making all scheduling decisions from one CPU: + * + * The central CPU is the only one making scheduling decisions. All other + * CPUs kick the central CPU when they run out of tasks to run. + * + * There is one global BPF queue and the central CPU schedules all CPUs by + * dispatching from the global queue to each CPU's local dsq from dispatch(). + * This isn't the most straightforward; e.g., it'd be easier to bounce + * through per-CPU BPF queues. The current design is chosen to maximally + * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. + * + * b. Preemption + * + * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the + * next tasks. + * + * This scheduler is designed to maximize usage of various SCX mechanisms. A + * more practical implementation would likely put the scheduling loop outside + * the central CPU's dispatch() path and add some form of priority mechanism. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +enum { + FALLBACK_DSQ_ID = 0, +}; + +const volatile s32 central_cpu; +const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ +const volatile u64 slice_ns = SCX_SLICE_DFL; + +u64 nr_total, nr_locals, nr_queued, nr_lost_pids; +u64 nr_dispatches, nr_mismatches, nr_retries; +u64 nr_overflows; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, s32); +} central_q SEC(".maps"); + +/* can't use percpu map due to bad lookups */ +bool RESIZABLE_ARRAY(data, cpu_gimme_task); + +s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* + * Steer wakeups to the central CPU as much as possible to avoid + * disturbing other CPUs. It's safe to blindly return the central cpu as + * select_cpu() is a hint and if @p can't be on it, the kernel will + * automatically pick a fallback CPU.
+ */ + return central_cpu; +} + +void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) +{ + s32 pid = p->pid; + + __sync_fetch_and_add(&nr_total, 1); + + if (bpf_map_push_elem(&central_q, &pid, 0)) { + __sync_fetch_and_add(&nr_overflows, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_queued, 1); + + if (!scx_bpf_task_running(p)) + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); +} + +static bool dispatch_to_cpu(s32 cpu) +{ + struct task_struct *p; + s32 pid; + + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(&central_q, &pid)) + break; + + __sync_fetch_and_sub(&nr_queued, 1); + + p = bpf_task_from_pid(pid); + if (!p) { + __sync_fetch_and_add(&nr_lost_pids, 1); + continue; + } + + /* + * If we can't run the task at the top, do the dumb thing and + * bounce it to the fallback dsq. + */ + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + __sync_fetch_and_add(&nr_mismatches, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + bpf_task_release(p); + /* + * We might run out of dispatch buffer slots if we continue dispatching + * to the fallback DSQ without dispatching to the local DSQ of the + * target CPU. In such a case, break the loop now as the next dispatch + * operation will fail. + */ + if (!scx_bpf_dispatch_nr_slots()) + break; + continue; + } + + /* dispatch to local and mark that @cpu doesn't need more */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + + if (cpu != central_cpu) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + + bpf_task_release(p); + return true; + } + + return false; +} + +void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) +{ + if (cpu == central_cpu) { + /* dispatch for all other CPUs first */ + __sync_fetch_and_add(&nr_dispatches, 1); + + bpf_for(cpu, 0, nr_cpu_ids) { + bool *gimme; + + if (!scx_bpf_dispatch_nr_slots()) + break; + + /* central's gimme is never set */ + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (!gimme || !*gimme) + continue; + + if (dispatch_to_cpu(cpu)) + *gimme = false; + } + + /* + * Retry if we ran out of dispatch buffer slots as we might have + * skipped some CPUs and also need to dispatch for self. The ext + * core automatically retries if the local dsq is empty but we + * can't rely on that as we're dispatching for other CPUs too. + * Kick self explicitly to retry. + */ + if (!scx_bpf_dispatch_nr_slots()) { + __sync_fetch_and_add(&nr_retries, 1); + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + return; + } + + /* look for a task to run on the central CPU */ + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + dispatch_to_cpu(central_cpu); + } else { + bool *gimme; + + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (gimme) + *gimme = true; + + /* + * Force dispatch on the scheduling CPU so that it finds a task + * to run for us. + */ + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + } +} + +int BPF_STRUCT_OPS_SLEEPABLE(central_init) +{ + return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); +} + +void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(central_ops, + /* + * We are offloading all scheduling decisions to the central CPU + * and thus being the last task on a given CPU doesn't mean + * anything special. Enqueue the last tasks like any other tasks.
+ */ + .flags = SCX_OPS_ENQ_LAST, + + .select_cpu = (void *)central_select_cpu, + .enqueue = (void *)central_enqueue, + .dispatch = (void *)central_dispatch, + .init = (void *)central_init, + .exit = (void *)central_exit, + .name = "central"); diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c new file mode 100644 index 000000000000..5f09fc666a63 --- /dev/null +++ b/tools/sched_ext/scx_central.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "scx_central.bpf.skel.h" + +const char help_fmt[] = +"A central FIFO sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-c CPU]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -c CPU Override the central CPU (default: 0)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_central *skel; + struct bpf_link *link; + __u64 seq = 0; + __s32 opt; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + skel = SCX_OPS_OPEN(central_ops, scx_central); + + skel->rodata->central_cpu = 0; + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:c:vh")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'c': + skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); + + SCX_OPS_LOAD(skel, central_ops, scx_central, uei); + link = SCX_OPS_ATTACH(skel, central_ops, scx_central); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("[SEQ %llu]\n", seq++); + printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", + skel->bss->nr_total, + skel->bss->nr_locals, + skel->bss->nr_queued, + skel->bss->nr_lost_pids); + printf(" dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + skel->bss->nr_dispatches, + skel->bss->nr_mismatches, + skel->bss->nr_retries); + printf("overflow:%10" PRIu64 "\n", + skel->bss->nr_overflows); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_central__destroy(skel); + return 0; +} -- cgit v1.2.3 From 0922f54fdd15aedb93730eb8cfa0c069cbad4e08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Make watchdog handle ops.dispatch() looping stall The dispatch path retries if the local DSQ is still empty after ops.dispatch() either dispatched or consumed a task. This is both out of necessity and for convenience.
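In simplified form, the retry loop in balance_scx() has roughly the following shape (a sketch pieced together from the ext.c hunks in this series; the ops.dispatch() invocation is paraphrased, not verbatim kernel code):

	do {
		dspc->nr_tasks = 0;

		/* ask the BPF scheduler to dispatch and/or consume tasks */
		SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), prev);
		flush_dispatch_buf(rq, rf);

		/* done as soon as the local or global DSQ yields a task */
		if (rq->scx.local_dsq.nr)
			goto has_tasks;
		if (consume_dispatch_q(rq, rf, &scx_dsq_global))
			goto has_tasks;
	} while (dspc->nr_tasks);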
It has to retry because the dispatch path may lose the tasks to dequeue while the rq lock is released to migrate tasks across CPUs, and the retry mechanism makes ops.dispatch() implementation easier as it only needs to make some forward progress each iteration. However, this makes it possible for ops.dispatch() to stall CPUs by repeatedly dispatching ineligible tasks. If all CPUs are stalled that way, the watchdog or sysrq handler can't run and the system can't be saved. Let's address the issue by breaking out of the dispatch loop after 32 iterations. It is unlikely but not impossible for ops.dispatch() to legitimately go over the iteration limit. We want to come back to the dispatch path in such cases as not doing so risks stalling the CPU by idling with runnable tasks pending. As the previous task is still current in balance_scx(), resched_curr() doesn't do anything - it will just get cleared. Let's instead use scx_bpf_kick_cpu() which will trigger reschedule after switching to the next task which will likely be the idle task. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- kernel/sched/ext.c | 17 +++++++++++++++++ tools/sched_ext/scx_qmap.bpf.c | 15 +++++++++++++++ tools/sched_ext/scx_qmap.c | 8 ++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 213793d086d7..89bcca84d6b5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8,6 +8,7 @@ enum scx_consts { SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, SCX_EXIT_BT_LEN = 64, @@ -665,6 +666,7 @@ static struct kobject *scx_root_kobj; #define CREATE_TRACE_POINTS #include +static void scx_bpf_kick_cpu(s32 cpu, u64 flags); static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, s64 exit_code, const char *fmt, ...); @@ -1906,6 +1908,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; + int nr_loops = SCX_DSP_MAX_LOOPS; bool has_tasks = false; lockdep_assert_rq_held(rq); @@ -1962,6 +1965,20 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, goto has_tasks; if (consume_dispatch_q(rq, rf, &scx_dsq_global)) goto has_tasks; + + /* + * ops.dispatch() can trap us in this loop by repeatedly + * dispatching ineligible tasks. Break out once in a while to + * allow the watchdog to run. As IRQ can't be enabled in + * balance(), we want to complete this scheduling cycle and then + * start a new one. IOW, we want to call resched_curr() on the + * next, most likely idle, task, not the current one. Use + * scx_bpf_kick_cpu() for deferred kicking.
+ */ + if (unlikely(!--nr_loops)) { + scx_bpf_kick_cpu(cpu_of(rq), 0); + break; + } } while (dspc->nr_tasks); goto out; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 5b3da28bf042..879fc9c788e5 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -31,6 +31,7 @@ char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; +const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -198,6 +199,20 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (scx_bpf_consume(SHARED_DSQ)) return; + if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { + /* + * PID 2 should be kthreadd which should mostly be idle and off + * the scheduler. Let's keep dispatching it to force the kernel + * to call this function over and over again. + */ + p = bpf_task_from_pid(2); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { scx_bpf_error("failed to look up cpu_ctx"); return; diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index a1123a17581b..594147a710a8 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -19,13 +19,14 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" " [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" +" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" @@ -61,7 +62,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:d:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -75,6 +76,9 @@ case 'T': skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); break; + case 'l': + skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); + break; case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; -- cgit v1.2.3 From 22a920209ab6aa4f8ec960ed81041643fddeaec6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Implement tickless support Allow BPF schedulers to indicate tickless operation by setting p->scx.slice to SCX_SLICE_INF. A CPU whose current task has infinite slice goes into tickless operation. scx_central is updated to use tickless operations for all tasks and instead use a BPF timer to expire slices. This also uses the SCX_ENQ_PREEMPT and task state tracking added by the previous patches. Currently, there is no way to pin the timer on the central CPU, so it may end up on one of the worker CPUs; however, outside of that, the worker CPUs can go tickless both while running sched_ext tasks and idling.
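From the BPF scheduler's side, tickless operation is opted into one dispatch at a time by passing the infinite slice. A minimal sketch (my_enqueue is a hypothetical op, not part of this patch):

	void BPF_STRUCT_OPS(my_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/*
		 * With SCX_SLICE_INF the slice never expires, so a
		 * CONFIG_NO_HZ_FULL CPU running @p can stop its tick. The
		 * scheduler must later call scx_bpf_kick_cpu() (e.g. from a
		 * BPF timer) to force a task switch.
		 */
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_INF, enq_flags);
	}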
With schbench running, scx_central shows: root@test ~# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 142024 656 664 449 Local timer interrupts LOC: 161663 663 665 449 Local timer interrupts Without it: root@test ~ [SIGINT]# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 188778 3142 3793 3993 Local timer interrupts LOC: 198993 5314 6323 6438 Local timer interrupts While scx_central itself is too barebone to be useful as a production scheduler, a more featureful central scheduler can be built using the same approach. Google's experience shows that such an approach can have significant benefits for certain applications such as VM hosting. v4: Allow operation even if BPF_F_TIMER_CPU_PIN is not available. v3: Pin the central scheduler's timer on the central_cpu using BPF_F_TIMER_CPU_PIN. v2: Convert to BPF inline iterators. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 1 + kernel/sched/core.c | 11 ++- kernel/sched/ext.c | 52 ++++++++++++- kernel/sched/ext.h | 2 + kernel/sched/sched.h | 1 + tools/sched_ext/scx_central.bpf.c | 159 ++++++++++++++++++++++++++++++++++++-- tools/sched_ext/scx_central.c | 29 ++++++- 7 files changed, 242 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3b2809b980ac..6f1a4977e9f8 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -16,6 +16,7 @@ enum scx_public_consts { SCX_OPS_NAME_LEN = 128, SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a3144c80af8..d5eff4036be7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1256,11 +1256,14 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) + if (!scx_switched_all() && rq->nr_running > 1) + return false; + + if (scx_enabled() && !scx_can_stop_tick(rq)) return false; /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2e652f7b8f54..ce32fc6b05cd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1086,7 +1086,8 @@ static void update_curr_scx(struct rq *rq) account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); - curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (curr->scx.slice != SCX_SLICE_INF) + curr->scx.slice -= min(curr->scx.slice, delta_exec); } static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) @@ -2093,6 +2094,28 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) SCX_CALL_OP(SCX_KF_REST, running, p); clr_task_runnable(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 
+ */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + + /* + * For now, let's refresh the load_avgs just when transitioning + * in and out of nohz. In the future, we might want to add a + * mechanism which calls the following periodically on + * tick-stopped CPUs. + */ + update_other_load_avgs(rq); + } } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -2818,6 +2841,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy) return 0; } +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_ops_bypassing()) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). + */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + /* * Omitted operations: * @@ -3120,6 +3163,9 @@ static void scx_ops_bypass(bool bypass) } rq_unlock_irqrestore(rq, &rf); + + /* kick to restore ticks */ + resched_cpu(cpu); } } @@ -4576,7 +4622,9 @@ __bpf_kfunc_start_defs(); * BPF locks (in the future when BPF introduces more flexible locking). * * @p is allowed to run for @slice. The scheduling path is triggered on slice - * exhaustion. If zero, the current residual slice is maintained. + * exhaustion. If zero, the current residual slice is maintained. If + * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with + * scx_bpf_kick_cpu() to trigger scheduling. */ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 33a9f7fe5832..6ed946f72489 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -35,6 +35,7 @@ void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); +bool scx_can_stop_tick(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(struct task_struct *p); void init_sched_ext_class(void); @@ -73,6 +74,7 @@ static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } static inline void init_sched_ext_class(void) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d9054eb4ba82..b3c578cb43cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -727,6 +727,7 @@ struct cfs_rq { /* scx_rq->flags, protected by the rq lock */ enum scx_rq_flags { SCX_RQ_BALANCING = 1 << 1, + SCX_RQ_CAN_STOP_TICK = 1 << 2, }; struct scx_rq { diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 428b2262faa3..1d8fd570eaa7 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -13,7 +13,26 @@ * through per-CPU BPF queues. 
The current design is chosen to maximally * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. * - * b. Preemption + * b. Tickless operation + + * + * All tasks are dispatched with the infinite slice which allows stopping the + * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full + * parameter. The tickless operation can be observed through + * /proc/interrupts. + * + * Periodic switching is enforced by a periodic timer checking all CPUs and + * preempting them as necessary. Where available, BPF_F_TIMER_CPU_PIN is used + * to pin the timer to the central CPU; on older kernels without it, the + * periodic timer may end up on any CPU. + * + * c. Preemption + * + * Kthreads are unconditionally queued to the head of a matching local dsq + * and dispatched with SCX_ENQ_PREEMPT. This ensures that a kthread is always + * prioritized over user threads, which is required for ensuring forward + * progress as e.g. the periodic timer may run on a ksoftirqd and if the + * ksoftirqd gets starved by a user thread, there may be nothing else that + * can preempt that user thread. * * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the * next tasks. @@ -32,14 +51,17 @@ char _license[] SEC("license") = "GPL"; enum { FALLBACK_DSQ_ID = 0, + MS_TO_NS = 1000LLU * 1000, + TIMER_INTERVAL_NS = 1 * MS_TO_NS, }; const volatile s32 central_cpu; const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ const volatile u64 slice_ns = SCX_SLICE_DFL; +bool timer_pinned = true; u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -u64 nr_dispatches, nr_mismatches, nr_retries; +u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; u64 nr_overflows; UEI_DEFINE(uei); @@ -52,6 +74,23 @@ struct { /* can't use percpu map due to bad lookups */ bool RESIZABLE_ARRAY(data, cpu_gimme_task); +u64 RESIZABLE_ARRAY(data, cpu_started_at); + +struct central_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct central_timer); +} central_timer SEC(".maps"); + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -71,9 +110,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) __sync_fetch_and_add(&nr_total, 1); + /* + * Push per-cpu kthreads at the head of local dsq's and preempt the + * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked + * behind other threads which is necessary for forward progress + * guarantee as we depend on the BPF timer which may run from ksoftirqd.
+ */ + if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + __sync_fetch_and_add(&nr_locals, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + if (bpf_map_push_elem(&central_q, &pid, 0)) { __sync_fetch_and_add(&nr_overflows, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); return; } @@ -106,7 +158,7 @@ static bool dispatch_to_cpu(s32 cpu) */ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { __sync_fetch_and_add(&nr_mismatches, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); /* * We might run out of dispatch buffer slots if we continue dispatching @@ -120,7 +172,7 @@ static bool dispatch_to_cpu(s32 cpu) } /* dispatch to local and mark that @cpu doesn't need more */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); if (cpu != central_cpu) scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); @@ -188,9 +240,102 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(central_running, struct task_struct *p) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ +} + +void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = 0; +} + +static int central_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u64 now = bpf_ktime_get_ns(); + u64 nr_to_kick = nr_queued; + s32 i, curr_cpu; + + curr_cpu = bpf_get_smp_processor_id(); + if (timer_pinned && (curr_cpu != central_cpu)) { + scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", + curr_cpu, central_cpu); + return 0; + } + + bpf_for(i, 0, nr_cpu_ids) { + s32 cpu = (nr_timers + i) % nr_cpu_ids; + u64 *started_at; + + if (cpu == central_cpu) + continue; + + /* kick iff the current one exhausted its slice */ + started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at && *started_at && + vtime_before(now, *started_at + slice_ns)) + continue; + + /* and there's something pending */ + if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || + scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) + ; + else if (nr_to_kick) + nr_to_kick--; + else + continue; + + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + } + + bpf_timer_start(timer, TIMER_INTERVAL_NS, + timer_pinned ? BPF_F_TIMER_CPU_PIN : 0); + __sync_fetch_and_add(&nr_timers, 1); + return 0; +} + int BPF_STRUCT_OPS_SLEEPABLE(central_init) { - return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + u32 key = 0; + struct bpf_timer *timer; + int ret; + + ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(&central_timer, &key); + if (!timer) + return -ESRCH; + + if (bpf_get_smp_processor_id() != central_cpu) { + scx_bpf_error("init from non-central CPU"); + return -EINVAL; + } + + bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, central_timerfn); + + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + /* + * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a + * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. + * Retry without the PIN.
This would be the perfect use case for + * bpf_core_enum_value_exists() but the enum type doesn't have a name + * and can't be used with bpf_core_enum_value_exists(). Oh well... + */ + if (ret == -EINVAL) { + timer_pinned = false; + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + } + if (ret) + scx_bpf_error("bpf_timer_start failed (%d)", ret); + return ret; } void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) @@ -209,6 +354,8 @@ SCX_OPS_DEFINE(central_ops, .select_cpu = (void *)central_select_cpu, .enqueue = (void *)central_enqueue, .dispatch = (void *)central_dispatch, + .running = (void *)central_running, + .stopping = (void *)central_stopping, .init = (void *)central_init, .exit = (void *)central_exit, .name = "central"); diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 5f09fc666a63..fb3f50886552 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -48,6 +48,7 @@ int main(int argc, char **argv) struct bpf_link *link; __u64 seq = 0; __s32 opt; + cpu_set_t *cpuset; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); @@ -77,10 +78,35 @@ int main(int argc, char **argv) /* Resize arrays so their element count is equal to cpu count. */ RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids); SCX_OPS_LOAD(skel, central_ops, scx_central, uei); + + /* + * Affinitize the loading thread to the central CPU, as: + * - That's where the BPF timer is first invoked in the BPF program. + * - We probably don't want this user space component to take up a core + * from a task that would benefit from avoiding preemption on one of + * the tickless cores. + * + * Until BPF supports pinning the timer, it's not guaranteed that it + * will always be invoked on the central CPU. In practice, this + * suffices the majority of the time. + */ + cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); + SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); + CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); + CPU_SET_S(skel->rodata->central_cpu, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); + SCX_BUG_ON(sched_setaffinity(0, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset), + "Failed to affinitize to central CPU %d (max %d)", + skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); + CPU_FREE(cpuset); + link = SCX_OPS_ATTACH(skel, central_ops, scx_central); + if (!skel->data->timer_pinned) + printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); + while (!exit_req && !UEI_EXITED(skel, uei)) { printf("[SEQ %llu]\n", seq++); printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", @@ -88,7 +114,8 @@ skel->bss->nr_locals, skel->bss->nr_queued, skel->bss->nr_lost_pids); - printf(" dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + skel->bss->nr_timers, skel->bss->nr_dispatches, skel->bss->nr_mismatches, skel->bss->nr_retries); -- cgit v1.2.3 From 245254f7081dbe1c8da54675d0e4ddbe74cee61b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 18 Jun 2024 10:09:20 -1000 Subject: sched_ext: Implement sched_ext_ops.cpu_acquire/release() Scheduler classes are strictly ordered and when a higher priority class has tasks to run, the lower priority ones lose access to the CPU. Being able to monitor and act on these events is necessary for use cases including strict core-scheduling and latency management.
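As a taste of what acting on these events looks like with the interface introduced below, a minimal hypothetical release handler could evacuate everything stranded on the preempted CPU:

	void BPF_STRUCT_OPS(my_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
	{
		/*
		 * A higher priority sched_class took @cpu. Push the tasks
		 * sitting in its local DSQ back through ops.enqueue() so they
		 * can be placed on CPUs still under SCX control.
		 */
		scx_bpf_reenqueue_local();
	}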
This patch adds two operations ops.cpu_acquire() and .cpu_release(). The former is invoked when a CPU becomes available to the BPF scheduler and the opposite for the latter. This patch also implements scx_bpf_reenqueue_local() which can be called from .cpu_release() to trigger requeueing of all tasks in the local dsq of the CPU so that the tasks can be reassigned to other available CPUs. scx_pair is updated to use .cpu_acquire/release() along with %SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU is preempted by a higher priority scheduler class. scx_qmap is updated to use .cpu_acquire/release() to empty the local dsq of a preempted CPU. A similar approach can be adopted by BPF schedulers that want to have a tight control over latency. v4: Use the new SCX_KICK_IDLE to wake up a CPU after re-enqueueing. v3: Drop the const qualifier from scx_cpu_release_args.task. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers. v2: Add p->scx.kf_mask annotation to allow calling scx_bpf_reenqueue_local() from ops.cpu_release() nested inside ops.init() and other sleepable operations. Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 4 +- kernel/sched/ext.c | 198 ++++++++++++++++++++++++++++++- kernel/sched/ext.h | 2 + kernel/sched/sched.h | 1 + tools/sched_ext/include/scx/common.bpf.h | 1 + tools/sched_ext/scx_qmap.bpf.c | 37 +++++- tools/sched_ext/scx_qmap.c | 4 +- 7 files changed, 240 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 74341dbc6a19..21c627337e01 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -98,13 +98,15 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ /* all non-sleepables may be nested inside SLEEPABLE */ SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ /* ops.dequeue (in REST) may be nested inside DISPATCH */ SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ - __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1ca3067b4e0a..686dab6ab592 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -110,6 +110,32 @@ struct scx_exit_task_args { bool cancelled; }; +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. 
+ */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + /* * Informational context provided to dump operations. */ @@ -335,6 +361,28 @@ struct sched_ext_ops { */ void (*update_idle)(s32 cpu, bool idle); + /** + * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * cpu_release - A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + /** * init_task - Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling @@ -487,6 +535,17 @@ enum scx_enq_flags { */ SCX_ENQ_PREEMPT = 1LLU << 32, + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + /* * The task being enqueued is the only task available for the cpu. By * default, ext core keeps executing such tasks but when @@ -625,6 +684,7 @@ static bool scx_warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); struct static_key_false scx_has_op[SCX_OPI_END] = @@ -887,6 +947,12 @@ static __always_inline bool scx_kf_allowed(u32 mask) * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE * boundary thanks to the above in_interrupt() check. */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { + scx_ops_error("cpu_release kfunc called from a nested operation"); + return false; + } + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { scx_ops_error("dispatch kfunc called from a nested operation"); @@ -2070,6 +2136,19 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_BALANCING; + if (static_branch_unlikely(&scx_ops_cpu_preempt) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. This callback complements ->cpu_release(), which is + * emitted in scx_next_task_picked().
+ */ + if (SCX_HAS_OP(cpu_acquire)) + SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + rq->scx.cpu_released = false; + } + if (prev_on_scx) { WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); update_curr_scx(rq); @@ -2077,7 +2156,9 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, /* * If @prev is runnable & has slice left, it has priority and * fetching more just increases latency for the fetched tasks. - * Tell put_prev_task_scx() to put @prev on local_dsq. + * Tell put_prev_task_scx() to put @prev on local_dsq. If the + * BPF scheduler wants to handle this explicitly, it should + * implement ->cpu_release(). * * See scx_ops_disable_workfn() for the explanation on the * bypassing test. @@ -2297,6 +2378,20 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return p; } +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ +#ifdef CONFIG_SMP + if (class == &stop_sched_class) + return SCX_CPU_PREEMPT_STOP; +#endif + if (class == &dl_sched_class) + return SCX_CPU_PREEMPT_DL; + if (class == &rt_sched_class) + return SCX_CPU_PREEMPT_RT; + return SCX_CPU_PREEMPT_UNKNOWN; +} + void scx_next_task_picked(struct rq *rq, struct task_struct *p, const struct sched_class *active) { @@ -2312,6 +2407,40 @@ void scx_next_task_picked(struct rq *rq, struct task_struct *p, */ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); #endif + if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + return; + + /* + * The callback is conceptually meant to convey that the CPU is no + * longer under the control of SCX. Therefore, don't invoke the + * callback if the CPU is staying on SCX, or going idle (in which + * case the SCX scheduler has actively decided not to schedule any + * tasks on the CPU). + */ + if (likely(active >= &ext_sched_class)) + return; + + /* + * At this point we know that SCX was preempted by a higher priority + * sched_class, so invoke the ->cpu_release() callback if we have not + * done so already. We only send the callback once between SCX being + * preempted and it regaining control of the CPU. + * + * ->cpu_release() complements ->cpu_acquire(), which is emitted the + * next time that balance_scx() is invoked.
+ */ + if (!rq->scx.cpu_released) { + if (SCX_HAS_OP(cpu_release)) { + struct scx_cpu_release_args args = { + .reason = preempt_reason_from_class(active), + .task = p, + }; + + SCX_CALL_OP(SCX_KF_CPU_RELEASE, + cpu_release, cpu_of(rq), &args); + } + rq->scx.cpu_released = true; + } } #ifdef CONFIG_SMP @@ -3398,6 +3527,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable_cpuslocked(&scx_has_op[i]); static_branch_disable_cpuslocked(&scx_ops_enq_last); static_branch_disable_cpuslocked(&scx_ops_enq_exiting); + static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); synchronize_rcu(); @@ -3699,9 +3829,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_init(&ns, buf, avail); dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu pnt_seq=%lu", + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", cpu, rq->scx.nr_running, rq->scx.flags, - rq->scx.ops_qseq, rq->scx.pnt_seq); + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.pnt_seq); dump_line(&ns, " curr=%s[%d] class=%ps", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); @@ -3942,6 +4073,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) + static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); @@ -4318,6 +4451,8 @@ static bool yield_stub(struct task_struct *from, struct task_struct *to) { retur static void set_weight_stub(struct task_struct *p, u32 weight) {} static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} static void update_idle_stub(s32 cpu, bool idle) {} +static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} +static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} static void enable_stub(struct task_struct *p) {} @@ -4338,6 +4473,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .set_weight = set_weight_stub, .set_cpumask = set_cpumask_stub, .update_idle = update_idle_stub, + .cpu_acquire = cpu_acquire_stub, + .cpu_release = cpu_release_stub, .init_task = init_task_stub, .exit_task = exit_task_stub, .enable = enable_stub, @@ -4870,6 +5007,59 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { __bpf_kfunc_start_defs(); +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + u32 nr_enqueued, i; + struct rq *rq; + + if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + + /* + * Get the number of tasks on the local DSQ before iterating over it to + * pull off tasks. The enqueue callback below can signal that it wants + * the task to stay on the local DSQ, and we want to prevent the BPF + * scheduler from causing us to loop indefinitely. 
+ */ + nr_enqueued = rq->scx.local_dsq.nr; + for (i = 0; i < nr_enqueued; i++) { + struct task_struct *p; + + p = first_local_task(rq); + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != + SCX_OPSS_NONE); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + WARN_ON_ONCE(p->scx.holding_cpu != -1); + dispatch_dequeue(rq, p); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + } + + return nr_enqueued; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + +__bpf_kfunc_start_defs(); + /** * scx_bpf_kick_cpu - Trigger reschedule on a CPU * @cpu: cpu to kick @@ -5379,6 +5569,8 @@ static int __init scx_init(void) &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_any)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 0aeb1fda1794..4ebd1c2478f1 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -24,6 +24,8 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); #define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) #define scx_switched_all() static_branch_unlikely(&__scx_switched_all) +DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); + static inline bool task_on_scx(const struct task_struct *p) { return scx_enabled() && p->sched_class == &ext_sched_class; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 734206e13897..147d18cf01ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -737,6 +737,7 @@ struct scx_rq { u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; u32 flags; + bool cpu_released; cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 421118bc56ff..8686f84497db 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -34,6 +34,7 @@ void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flag u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; +u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 879fc9c788e5..4a87377558c8 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -11,6 +11,8 @@ * * - BPF-side queueing using PIDs. * - Sleepable per-task storage allocation using ops.prep_enable(). + * - Using ops.cpu_release() to handle a higher priority scheduling class taking + * the CPU away. * * This scheduler is primarily for demonstration and testing of sched_ext * features and unlikely to be useful for actual workloads. 
@@ -90,7 +92,7 @@ } cpu_ctx_stor SEC(".maps"); /* Statistics */ -u64 nr_enqueued, nr_dispatched, nr_dequeued; +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -164,6 +166,22 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* + * If the task was re-enqueued due to the CPU being preempted by a + * higher priority scheduling class, just re-enqueue the task directly + * on the global DSQ. As we want another CPU to pick it up, find and + * kick an idle CPU. + */ + if (enq_flags & SCX_ENQ_REENQ) { + s32 cpu; + + scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + return; + } + ring = bpf_map_lookup_elem(&queue_arr, &idx); + if (!ring) { + scx_bpf_error("failed to find ring %d", idx); @@ -257,6 +275,22 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + u32 cnt; + + /* + * Called when @cpu is taken by a higher priority scheduling class. This + * makes @cpu no longer available for executing sched_ext tasks. As we + * don't want the tasks in @cpu's local dsq to sit there until @cpu + * becomes available again, re-enqueue them into the global dsq. See + * %SCX_ENQ_REENQ handling in qmap_enqueue(). + */ + cnt = scx_bpf_reenqueue_local(); + if (cnt) + __sync_fetch_and_add(&nr_reenqueued, cnt); +} + s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, struct scx_init_task_args *args) { @@ -339,6 +373,7 @@ SCX_OPS_DEFINE(qmap_ops, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .cpu_release = (void *)qmap_cpu_release, .init_task = (void *)qmap_init_task, .dump = (void *)qmap_dump, .dump_cpu = (void *)qmap_dump_cpu, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 594147a710a8..2a97421afe9a 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -112,9 +112,9 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("stats : enq=%lu dsp=%lu delta=%ld deq=%"PRIu64"\n", + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64"\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, - skel->bss->nr_dequeued); + skel->bss->nr_reenqueued, skel->bss->nr_dequeued); fflush(stdout); sleep(1); } -- cgit v1.2.3 From 60c27fb59f6cffa73fc8c60e3a22323c78044576 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:20 -1000 Subject: sched_ext: Implement sched_ext_ops.cpu_online/offline() Add ops.cpu_online/offline() which are invoked when CPUs come online and offline respectively. As the enqueue path already automatically bypasses tasks to the local dsq on a deactivated CPU, BPF schedulers are guaranteed to see tasks only on CPUs which are between online() and offline(). If the BPF scheduler doesn't implement ops.cpu_online/offline(), the scheduler is automatically exited with SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG. Userspace can implement CPU hotplug support trivially by simply reinitializing and reloading the scheduler. scx_qmap is updated to print out online CPUs on hotplug events. Other schedulers are updated to restart based on ecode.
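The resulting userspace restart loop looks roughly like the following sketch (modeled on the scx_simple changes in this patch; the UEI_* macros are assumed to come from the user_exit_info.h update in the diffstat below):

	restart:
		skel = SCX_OPS_OPEN(simple_ops, scx_simple);
		SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
		link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple);

		while (!exit_req && !UEI_EXITED(skel, uei))
			sleep(1);

		bpf_link__destroy(link);
		ecode = UEI_REPORT(skel, uei);
		scx_simple__destroy(skel);

		/* SCX_ECODE_ACT_RESTART is set on hotplug-triggered exits */
		if (UEI_ECODE_RESTART(ecode))
			goto restart;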
v3: - The previous implementation added @reason to sched_class.rq_on/offline() to distinguish between CPU hotplug events and topology updates. This was buggy and fragile as the methods are skipped if the current state equals the target state. Instead, add scx_rq_[de]activate() which are directly called from sched_cpu_de/activate(). This also allows ops.cpu_on/offline() to sleep which can be useful. - ops.dispatch() could be called on a CPU that the BPF scheduler was told to be offline. The dispatch path is updated to bypass in such cases. v2: - To accommodate lock ordering change between scx_cgroup_rwsem and cpus_read_lock(), CPU hotplug operations are put into their own SCX_OPI block and enabled earlier during scx_ops_enable() so that cpus_read_lock() can be dropped before acquiring scx_cgroup_rwsem. - Auto exit with ECODE added. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 4 + kernel/sched/ext.c | 156 +++++++++++++++++++++++++-- kernel/sched/ext.h | 4 + kernel/sched/sched.h | 6 ++ tools/sched_ext/include/scx/compat.h | 26 +++++ tools/sched_ext/include/scx/user_exit_info.h | 28 +++++ tools/sched_ext/scx_central.c | 9 +- tools/sched_ext/scx_qmap.bpf.c | 57 ++++++++++ tools/sched_ext/scx_qmap.c | 4 + tools/sched_ext/scx_simple.c | 8 +- 10 files changed, 290 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0e6ff33f34e4..c798c847d57e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7984,6 +7984,8 @@ int sched_cpu_activate(unsigned int cpu) cpuset_cpu_active(); } + scx_rq_activate(rq); + /* * Put the rq online, if not already. This happens: * @@ -8044,6 +8046,8 @@ int sched_cpu_deactivate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); + scx_rq_deactivate(rq); + #ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 686dab6ab592..7c2f2a542b32 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -30,6 +30,29 @@ enum scx_exit_kind { SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ }; +/* + * An exit code can be specified when exiting with scx_bpf_exit() or + * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN + * respectively. The codes are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + /* * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is * being disabled. @@ -457,7 +480,29 @@ struct sched_ext_ops { void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); /* - * All online ops must come before ops.init(). + * All online ops must come before ops.cpu_online(). + */ + + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
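 *
 * (Editor's note, illustrative.) This callback is invoked sleepable (see the
 * SCX_KF_SLEEPABLE call in handle_hotplug() below), so a scheduler may do
 * blocking setup here, e.g. allocating or reinitializing per-CPU state for
 * @cpu before any tasks are enqueued on it.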
+ */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). */ /** @@ -496,6 +541,15 @@ struct sched_ext_ops { */ u32 exit_dump_len; + /** + * hotplug_seq - A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + /** * name - BPF scheduler's name * @@ -509,7 +563,9 @@ struct sched_ext_ops { enum scx_opi { SCX_OPI_BEGIN = 0, SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(init), + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), SCX_OPI_END = SCX_OP_IDX(init), }; @@ -694,6 +750,7 @@ static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without @@ -1419,11 +1476,7 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) static bool scx_rq_online(struct rq *rq) { -#ifdef CONFIG_SMP - return likely(rq->online); -#else - return true; -#endif + return likely(rq->scx.flags & SCX_RQ_ONLINE); } static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, @@ -1438,6 +1491,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (sticky_cpu == cpu_of(rq)) goto local_norefill; + /* + * If !scx_rq_online(), we already told the BPF scheduler that the CPU + * is offline and are just running the hotplug path. Don't bother the + * BPF scheduler. + */ if (!scx_rq_online(rq)) goto local; @@ -2673,6 +2731,42 @@ void __scx_update_idle(struct rq *rq, bool idle) #endif } +static void handle_hotplug(struct rq *rq, bool online) +{ + int cpu = cpu_of(rq); + + atomic_long_inc(&scx_hotplug_seq); + + if (online && SCX_HAS_OP(cpu_online)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu); + else if (!online && SCX_HAS_OP(cpu_offline)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu); + else + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? 
"online" : "offline"); +} + +void scx_rq_activate(struct rq *rq) +{ + handle_hotplug(rq, true); +} + +void scx_rq_deactivate(struct rq *rq) +{ + handle_hotplug(rq, false); +} + +static void rq_online_scx(struct rq *rq) +{ + rq->scx.flags |= SCX_RQ_ONLINE; +} + +static void rq_offline_scx(struct rq *rq) +{ + rq->scx.flags &= ~SCX_RQ_ONLINE; +} + #else /* CONFIG_SMP */ static bool test_and_clear_cpu_idle(int cpu) { return false; } @@ -3104,6 +3198,9 @@ DEFINE_SCHED_CLASS(ext) = { .balance = balance_scx, .select_task_rq = select_task_rq_scx, .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, #endif .task_tick = task_tick_scx, @@ -3235,10 +3332,18 @@ static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, } SCX_ATTR(nr_rejected); +static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); +} +SCX_ATTR(hotplug_seq); + static struct attribute *scx_global_attrs[] = { &scx_attr_state.attr, &scx_attr_switch_all.attr, &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, NULL, }; @@ -3941,6 +4046,25 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) return helper; } +static void check_hotplug_seq(const struct sched_ext_ops *ops) +{ + unsigned long long global_hotplug_seq; + + /* + * If a hotplug event has occurred between when a scheduler was + * initialized, and when we were able to attach, exit and notify user + * space about it. + */ + if (ops->hotplug_seq) { + global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); + if (ops->hotplug_seq != global_hotplug_seq) { + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + } + } +} + static int validate_ops(const struct sched_ext_ops *ops) { /* @@ -4023,6 +4147,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } } + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + cpus_read_unlock(); ret = validate_ops(ops); @@ -4064,6 +4192,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); cpus_read_lock(); + check_hotplug_seq(ops); + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); @@ -4374,6 +4504,9 @@ static int bpf_scx_init_member(const struct btf_type *t, ops->exit_dump_len = *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; return 1; + case offsetof(struct sched_ext_ops, hotplug_seq): + ops->hotplug_seq = *(u64 *)(udata + moff); + return 1; } return 0; @@ -4387,6 +4520,8 @@ static int bpf_scx_check_member(const struct btf_type *t, switch (moff) { case offsetof(struct sched_ext_ops, init_task): + case offsetof(struct sched_ext_ops, cpu_online): + case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): break; @@ -4457,6 +4592,8 @@ static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} static void enable_stub(struct task_struct *p) {} static void disable_stub(struct task_struct *p) {} +static void cpu_online_stub(s32 cpu) {} +static void cpu_offline_stub(s32 cpu) {} static s32 
init_stub(void) { return -EINVAL; } static void exit_stub(struct scx_exit_info *info) {} @@ -4479,6 +4616,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .exit_task = exit_task_stub, .enable = enable_stub, .disable = disable_stub, + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, .init = init_stub, .exit = exit_stub, }; @@ -4719,6 +4858,9 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + + if (cpu_online(cpu)) + cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; } register_sysrq_key('S', &sysrq_sched_ext_reset_op); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4ebd1c2478f1..037f9acdf443 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -40,6 +40,8 @@ int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); bool scx_can_stop_tick(struct rq *rq); +void scx_rq_activate(struct rq *rq); +void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(struct task_struct *p); void init_sched_ext_class(void); @@ -81,6 +83,8 @@ static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} static inline bool scx_can_stop_tick(struct rq *rq) { return true; } +static inline void scx_rq_activate(struct rq *rq) {} +static inline void scx_rq_deactivate(struct rq *rq) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } static inline void init_sched_ext_class(void) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 147d18cf01ce..c0d6e42c99cc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -726,6 +726,12 @@ struct cfs_rq { #ifdef CONFIG_SCHED_CLASS_EXT /* scx_rq->flags, protected by the rq lock */ enum scx_rq_flags { + /* + * A hotplugged CPU starts scheduling before rq_online_scx(). Track + * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called + * only while the BPF scheduler considers the CPU to be online. + */ + SCX_RQ_ONLINE = 1 << 0, SCX_RQ_BALANCING = 1 << 1, SCX_RQ_CAN_STOP_TICK = 1 << 2, }; diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index c58024c980c8..cc56ff9aa252 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -8,6 +8,9 @@ #define __SCX_COMPAT_H #include +#include +#include +#include struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); @@ -106,6 +109,28 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field #define SCX_OPS_SWITCH_PARTIAL \ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") +static inline long scx_hotplug_seq(void) +{ + int fd; + char buf[32]; + ssize_t len; + long val; + + fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); + if (fd < 0) + return -ENOENT; + + len = read(fd, buf, sizeof(buf) - 1); + SCX_BUG_ON(len <= 0, "read failed (%ld)", len); + buf[len] = 0; + close(fd); + + val = strtoul(buf, NULL, 10); + SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); + + return val; +} + /* * struct sched_ext_ops can change over time. 
If compat.bpf.h::SCX_OPS_DEFINE() * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load @@ -123,6 +148,7 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field \ __skel = __scx_name##__open(); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ + __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ __skel; \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index c2ef85c645e1..891693ee604e 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -77,7 +77,35 @@ struct user_exit_info { if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ fputs("\n", stderr); \ + __uei->exit_code; \ }) +/* + * We can't import vmlinux.h while compiling user C code. Let's duplicate + * scx_exit_code definition. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +enum uei_ecode_mask { + UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), + UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, + UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, +}; + +/* + * These macro interpret the ecode returned from UEI_REPORT(). + */ +#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) +#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) +#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) + +#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) + #endif /* __bpf__ */ #endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index fb3f50886552..21deea320bd7 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -46,14 +46,14 @@ int main(int argc, char **argv) { struct scx_central *skel; struct bpf_link *link; - __u64 seq = 0; + __u64 seq = 0, ecode; __s32 opt; cpu_set_t *cpuset; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); - +restart: skel = SCX_OPS_OPEN(central_ops, scx_central); skel->rodata->central_cpu = 0; @@ -126,7 +126,10 @@ int main(int argc, char **argv) } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_central__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 4a87377558c8..619078355bf5 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -358,8 +358,63 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc taskc->force_local); } +/* + * Print out the online and possible CPU map using bpf_printk() as a + * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). 
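 *
 * (Editor's note, illustrative only.) On an 8-CPU system where CPU 5 is
 * possible but currently offline, the emitted line would look like:
 *
 *   CPUS: |OOOOOXOO|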
+ */ +static void print_cpus(void) +{ + const struct cpumask *possible, *online; + s32 cpu; + char buf[128] = "", *p; + int idx; + + possible = scx_bpf_get_possible_cpumask(); + online = scx_bpf_get_online_cpumask(); + + idx = 0; + bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + if (bpf_cpumask_test_cpu(cpu, online)) + *p++ = 'O'; + else if (bpf_cpumask_test_cpu(cpu, possible)) + *p++ = 'X'; + else + *p++ = ' '; + + if ((cpu & 7) == 7) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + *p++ = '|'; + } + } + buf[sizeof(buf) - 1] = '\0'; + + scx_bpf_put_cpumask(online); + scx_bpf_put_cpumask(possible); + + bpf_printk("CPUS: |%s", buf); +} + +void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) +{ + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); +} + +void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) +{ + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { + print_cpus(); + return scx_bpf_create_dsq(SHARED_DSQ, -1); } @@ -378,6 +433,8 @@ SCX_OPS_DEFINE(qmap_ops, .dump = (void *)qmap_dump, .dump_cpu = (void *)qmap_dump_cpu, .dump_task = (void *)qmap_dump_task, + .cpu_online = (void *)qmap_cpu_online, + .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 2a97421afe9a..920fb54f9c77 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -122,5 +122,9 @@ int main(int argc, char **argv) bpf_link__destroy(link); UEI_REPORT(skel, uei); scx_qmap__destroy(skel); + /* + * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart + * on CPU hotplug events. + */ return 0; } diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 7f500d1d56ac..bead482e1383 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -62,11 +62,12 @@ int main(int argc, char **argv) struct scx_simple *skel; struct bpf_link *link; __u32 opt; + __u64 ecode; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); - +restart: skel = SCX_OPS_OPEN(simple_ops, scx_simple); while ((opt = getopt(argc, argv, "vh")) != -1) { @@ -93,7 +94,10 @@ int main(int argc, char **argv) } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_simple__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } -- cgit v1.2.3 From 7b0888b7cc1924b74ce660e02f6211df8dd46e7b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:20 -1000 Subject: sched_ext: Implement core-sched support The core-sched support is composed of the following parts: - task_struct->scx.core_sched_at is added. This is a timestamp which can be used to order tasks. Depending on whether the BPF scheduler implements custom ordering, it tracks either global FIFO ordering of all tasks or local-DSQ ordering within the dispatched tasks on a CPU. - prio_less() is updated to call scx_prio_less() when comparing SCX tasks. scx_prio_less() calls ops.core_sched_before() if available or uses the core_sched_at timestamp. For global FIFO ordering, the BPF scheduler doesn't need to do anything. Otherwise, it should implement ops.core_sched_before() which reflects the ordering. 
- When core-sched is enabled, balance_scx() balances all SMT siblings so that they all have tasks dispatched if necessary before pick_task_scx() is called. pick_task_scx() picks between the current task and the first dispatched task on the local DSQ based on availability and the core_sched_at timestamps. Note that FIFO ordering is expected among the already dispatched tasks whether running or on the local DSQ, so this path always compares core_sched_at instead of calling into ops.core_sched_before(). qmap_core_sched_before() is added to scx_qmap. It scales the distances from the heads of the queues to compare the tasks across different priority queues and seems to behave as expected. v3: Fixed build error when !CONFIG_SCHED_SMT reported by Andrea Righi. v2: Sched core added the const qualifiers to prio_less task arguments. Explicitly drop them for ops.core_sched_before() task arguments. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Reviewed-by: Josh Don Cc: Andrea Righi --- include/linux/sched/ext.h | 3 + kernel/Kconfig.preempt | 2 +- kernel/sched/core.c | 10 +- kernel/sched/ext.c | 250 +++++++++++++++++++++++++++++++++++++++-- kernel/sched/ext.h | 5 + tools/sched_ext/scx_qmap.bpf.c | 91 ++++++++++++++- tools/sched_ext/scx_qmap.c | 5 +- 7 files changed, 346 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 21c627337e01..3db7b32b2d1d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -129,6 +129,9 @@ struct sched_ext_entity { struct list_head runnable_node; /* rq->scx.runnable_list */ unsigned long runnable_at; +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif u64 ddsp_dsq_id; u64 ddsp_enq_flags; diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 39ecfc2b5a1c..7dde5e424ac3 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -135,7 +135,7 @@ config SCHED_CORE config SCHED_CLASS_EXT bool "Extensible Scheduling Class" - depends on BPF_SYSCALL && BPF_JIT && !SCHED_CORE + depends on BPF_SYSCALL && BPF_JIT help This option enables a new scheduler class sched_ext (SCX), which allows scheduling policies to be implemented as BPF programs to diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c798c847d57e..5eec6639773b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ + if (task_on_scx(p)) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } /* @@ -198,6 +201,11 @@ static inline bool prio_less(const struct task_struct *a, if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + return false; } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 26616cd0c5df..1feb690be9d8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -344,6 +344,24 @@ struct sched_ext_ops { */ bool (*yield)(struct task_struct *from, struct task_struct *to); + /** + * core_sched_before - Task ordering for core-sched + * @a: task A + * @b: task B + * + * 
Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + /** * set_weight - Set task weight * @p: task to set weight for @@ -625,6 +643,14 @@ enum scx_enq_flags { enum scx_deq_flags { /* expose select DEQUEUE_* flags as enums */ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, }; enum scx_pick_idle_cpu_flags { @@ -1260,6 +1286,49 @@ static int ops_sanitize_err(const char *ops_name, s32 err) return -EPROTO; } +/** + * touch_core_sched - Update timestamp used for core-sched task ordering + * @rq: rq to read clock from, must be locked + * @p: task to update the timestamp for + * + * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to + * implement global or local-DSQ FIFO ordering for core-sched. Should be called + * when a task becomes runnable and its turn on the CPU ends (e.g. slice + * exhaustion). + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). + */ + if (!sched_core_disabled()) + p->scx.core_sched_at = rq_clock_task(rq); +#endif +} + +/** + * touch_core_sched_dispatch - Update core-sched timestamp on dispatch + * @rq: rq to read clock from, must be locked + * @p: task being dispatched + * + * If the BPF scheduler implements custom core-sched ordering via + * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO + * ordering within each local DSQ. This function is called from dispatch paths + * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. + */ +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) + touch_core_sched(rq, p); +#endif +} + static void update_curr_scx(struct rq *rq) { struct task_struct *curr = rq->curr; @@ -1275,8 +1344,11 @@ static void update_curr_scx(struct rq *rq) account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); - if (curr->scx.slice != SCX_SLICE_INF) + if (curr->scx.slice != SCX_SLICE_INF) { curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } } static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) @@ -1469,6 +1541,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) { struct scx_dispatch_q *dsq; + touch_core_sched_dispatch(task_rq(p), p); + enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); dispatch_enqueue(dsq, p, enq_flags); @@ -1550,12 +1624,19 @@ direct: return; local: + /* + * For task-ordering, slice refill must be treated as implying the end + * of the current slice. 
Otherwise, the longer @p stays on the CPU, the + * higher priority it becomes from scx_prio_less()'s POV. + */ + touch_core_sched(rq, p); p->scx.slice = SCX_SLICE_DFL; local_norefill: dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); return; global: + touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; dispatch_enqueue(&scx_dsq_global, p, enq_flags); } @@ -1619,6 +1700,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if (SCX_HAS_OP(runnable)) SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + if (enq_flags & SCX_ENQ_WAKEUP) + touch_core_sched(rq, p); + do_enqueue_task(rq, p, enq_flags, sticky_cpu); } @@ -2106,6 +2190,7 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf, struct scx_dispatch_q *dsq; unsigned long opss; + touch_core_sched_dispatch(rq, p); retry: /* * No need for _acquire here. @p is accessed only after a successful @@ -2183,8 +2268,8 @@ static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) dspc->cursor = 0; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static int balance_one(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf, bool local) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; @@ -2208,7 +2293,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, } if (prev_on_scx) { - WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); + WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); update_curr_scx(rq); /* @@ -2220,10 +2305,16 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, * * See scx_ops_disable_workfn() for the explanation on the * bypassing test. + * + * When balancing a remote CPU for core-sched, there won't be a + * following put_prev_task_scx() call and we don't own + * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the + * same conditions later and pick @rq->curr accordingly. */ if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && !scx_ops_bypassing()) { - prev->scx.flags |= SCX_TASK_BAL_KEEP; + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; goto has_tasks; } } @@ -2285,10 +2376,56 @@ out: return has_tasks; } +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + int ret; + + ret = balance_one(rq, prev, rf, true); + +#ifdef CONFIG_SCHED_SMT + /* + * When core-sched is enabled, this ops.balance() call will be followed + * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() + * on the SMT siblings. Balance the siblings too. + */ + if (sched_core_enabled(rq)) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + int scpu; + + for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { + struct rq *srq = cpu_rq(scpu); + struct rq_flags srf; + struct task_struct *sprev = srq->curr; + + /* + * While core-scheduling, rq lock is shared among + * siblings but the debug annotations and rq clock + * aren't. Do pinning dance to transfer the ownership. 
+ */ + WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); + rq_unpin_lock(rq, rf); + rq_pin_lock(srq, &srf); + + update_rq_clock(srq); + balance_one(srq, sprev, &srf, false); + + rq_unpin_lock(srq, &srf); + rq_repin_lock(rq, rf); + } + } +#endif + return ret; +} + static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { if (p->scx.flags & SCX_TASK_QUEUED) { - WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + /* + * Core-sched might decide to execute @p before it is + * dispatched. Call ops_dequeue() to notify the BPF scheduler. + */ + ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); dispatch_dequeue(rq, p); } @@ -2379,7 +2516,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) /* * If @p has slice left and balance_scx() didn't tag it for * keeping, @p is getting preempted by a higher priority - * scheduler class. Leave it at the head of the local DSQ. + * scheduler class or core-sched forcing a different task. Leave + * it at the head of the local DSQ. */ if (p->scx.slice && !scx_ops_bypassing()) { dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); @@ -2436,6 +2574,84 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return p; } +#ifdef CONFIG_SCHED_CORE +/** + * scx_prio_less - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Core-sched is implemented as an additional scheduling layer on top of the + * usual sched_class'es and needs to find out the expected task ordering. For + * SCX, core-sched calls this function to interrogate the task ordering. + * + * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used + * to implement the default task ordering. The older the timestamp, the higher + * prority the task - the global FIFO ordering matching the default scheduling + * behavior. + * + * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to + * implement FIFO ordering within each local DSQ. See pick_task_scx(). + */ +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + /* + * The const qualifiers are dropped from task_struct pointers when + * calling ops.core_sched_before(). Accesses are controlled by the + * verifier. + */ + if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + (struct task_struct *)a, + (struct task_struct *)b); + else + return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); +} + +/** + * pick_task_scx - Pick a candidate task for core-sched + * @rq: rq to pick the candidate task from + * + * Core-sched calls this function on each SMT sibling to determine the next + * tasks to run on the SMT siblings. balance_one() has been called on all + * siblings and put_prev_task_scx() has been called only for the current CPU. + * + * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look + * at the first task in the local dsq. @rq->curr has to be considered explicitly + * to mimic %SCX_TASK_BAL_KEEP. + */ +static struct task_struct *pick_task_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct task_struct *first = first_local_task(rq); + + if (curr->scx.flags & SCX_TASK_QUEUED) { + /* is curr the only runnable task? */ + if (!first) + return curr; + + /* + * Does curr trump first? We can always go by core_sched_at for + * this comparison as it represents global FIFO ordering when + * the default core-sched ordering is used and local-DSQ FIFO + * ordering otherwise. 
+ * + * We can have a task with an earlier timestamp on the DSQ. For + * example, when a current task is preempted by a sibling + * picking a different cookie, the task would be requeued at the + * head of the local DSQ with an earlier timestamp than the + * core-sched picked next task. Besides, the BPF scheduler may + * dispatch any tasks to the local DSQ anytime. + */ + if (curr->scx.slice && time_before64(curr->scx.core_sched_at, + first->scx.core_sched_at)) + return curr; + } + + return first; /* this may be %NULL */ +} +#endif /* CONFIG_SCHED_CORE */ + static enum scx_cpu_preempt_reason preempt_reason_from_class(const struct sched_class *class) { @@ -2843,13 +3059,15 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) update_curr_scx(rq); /* - * While bypassing, always resched as we can't trust the slice - * management. + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). */ - if (scx_ops_bypassing()) + if (scx_ops_bypassing()) { curr->scx.slice = 0; - else if (SCX_HAS_OP(tick)) + touch_core_sched(rq, curr); + } else if (SCX_HAS_OP(tick)) { SCX_CALL_OP(SCX_KF_REST, tick, curr); + } if (!curr->scx.slice) resched_curr(rq); @@ -3203,6 +3421,10 @@ DEFINE_SCHED_CLASS(ext) = { .rq_offline = rq_offline_scx, #endif +#ifdef CONFIG_SCHED_CORE + .pick_task = pick_task_scx, +#endif + .task_tick = task_tick_scx, .switching_to = switching_to_scx, @@ -3416,12 +3638,14 @@ bool task_should_scx(struct task_struct *p) * * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be * trusted. Whenever a tick triggers, the running task is rotated to the tail - * of the queue. + * of the queue with core_sched_at touched. * * d. pick_next_task() suppresses zero slice warning. * * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. + * + * f. scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { @@ -4583,6 +4807,7 @@ static void running_stub(struct task_struct *p) {} static void stopping_stub(struct task_struct *p, bool runnable) {} static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } +static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } static void set_weight_stub(struct task_struct *p, u32 weight) {} static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} static void update_idle_stub(s32 cpu, bool idle) {} @@ -4607,6 +4832,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .stopping = stopping_stub, .quiescent = quiescent_stub, .yield = yield_stub, + .core_sched_before = core_sched_before_stub, .set_weight = set_weight_stub, .set_cpumask = set_cpumask_stub, .update_idle = update_idle_stub, diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 037f9acdf443..6555878c5da3 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -70,6 +70,11 @@ static inline const struct sched_class *next_active_class(const struct sched_cla for_active_class_range(class, (prev_class) > &ext_sched_class ? 
\ &ext_sched_class : (prev_class), (end_class)) +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + #else /* CONFIG_SCHED_CLASS_EXT */ #define scx_enabled() false diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 619078355bf5..c75c70d6a8eb 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -13,6 +13,7 @@ * - Sleepable per-task storage allocation using ops.prep_enable(). * - Using ops.cpu_release() to handle a higher priority scheduling class taking * the CPU away. + * - Core-sched support. * * This scheduler is primarily for demonstration and testing of sched_ext * features and unlikely to be useful for actual workloads. @@ -67,9 +68,21 @@ struct { }, }; +/* + * Per-queue sequence numbers to implement core-sched ordering. + * + * Tail seq is assigned to each queued task and incremented. Head seq tracks the + * sequence number of the latest dispatched task. The distance between the a + * task's seq and the associated queue's head seq is called the queue distance + * and used when comparing two tasks for ordering. See qmap_core_sched_before(). + */ +static u64 core_sched_head_seqs[5]; +static u64 core_sched_tail_seqs[5]; + /* Per-task scheduling context */ struct task_ctx { bool force_local; /* Dispatch directly to local_dsq */ + u64 core_sched_seq; }; struct { @@ -93,6 +106,7 @@ struct { /* Statistics */ u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_core_sched_execed; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -159,8 +173,18 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* Is select_cpu() is telling us to enqueue locally? */ - if (tctx->force_local) { + /* + * All enqueued tasks must have their core_sched_seq updated for correct + * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in + * qmap_ops.flags. + */ + tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + + /* + * If qmap_select_cpu() is telling us to or this is the last runnable + * task on the CPU, enqueue locally. + */ + if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { tctx->force_local = false; scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; @@ -204,6 +228,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) { __sync_fetch_and_add(&nr_dequeued, 1); + if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) + __sync_fetch_and_add(&nr_core_sched_execed, 1); +} + +static void update_core_sched_head_seq(struct task_struct *p) +{ + struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); + + if (tctx) + core_sched_head_seqs[idx] = tctx->core_sched_seq; + else + scx_bpf_error("task_ctx lookup failed"); } void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) @@ -258,6 +295,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (!p) continue; + update_core_sched_head_seq(p); __sync_fetch_and_add(&nr_dispatched, 1); scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); bpf_task_release(p); @@ -275,6 +313,49 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +/* + * The distance from the head of the queue scaled by the weight of the queue. + * The lower the number, the older the task and the higher the priority. 
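 *
 * For example (editor's illustration, not in the original comment): at an
 * equal distance of 8 from their queue heads, a queue-1 task yields a qdist
 * of 8 << 3 == 64 while a queue-3 task yields 8 << 1 == 16, so the queue-3
 * task compares as higher priority, consistent with queue 3 being dispatched
 * four times as frequently as queue 1.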
+ */ +static s64 task_qdist(struct task_struct *p) +{ + int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; + s64 qdist; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return 0; + } + + qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + + /* + * As queue index increments, the priority doubles. The queue w/ index 3 + * is dispatched twice more frequently than 2. Reflect the difference by + * scaling qdists accordingly. Note that the shift amount needs to be + * flipped depending on the sign to avoid flipping priority direction. + */ + if (qdist >= 0) + return qdist << (4 - idx); + else + return qdist << idx; +} + +/* + * This is called to determine the task ordering when core-sched is picking + * tasks to execute on SMT siblings and should encode about the same ordering as + * the regular scheduling path. Use the priority-scaled distances from the head + * of the queues to compare the two tasks which should be consistent with the + * dispatch path behavior. + */ +bool BPF_STRUCT_OPS(qmap_core_sched_before, + struct task_struct *a, struct task_struct *b) +{ + return task_qdist(a) > task_qdist(b); +} + void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) { u32 cnt; @@ -354,8 +435,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) return; - scx_bpf_dump("QMAP: force_local=%d", - taskc->force_local); + scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", + taskc->force_local, taskc->core_sched_seq); } /* @@ -428,6 +509,7 @@ SCX_OPS_DEFINE(qmap_ops, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, .init_task = (void *)qmap_init_task, .dump = (void *)qmap_dump, @@ -437,5 +519,6 @@ SCX_OPS_DEFINE(qmap_ops, .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, + .flags = SCX_OPS_ENQ_LAST, .timeout_ms = 5000U, .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 920fb54f9c77..bc36ec4f88a7 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -112,9 +112,10 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64"\n", + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64"\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, - skel->bss->nr_reenqueued, skel->bss->nr_dequeued); + skel->bss->nr_reenqueued, skel->bss->nr_dequeued, + skel->bss->nr_core_sched_execed); fflush(stdout); sleep(1); } -- cgit v1.2.3 From 06e51be3d5e7a07aea5c9012773df8d5de01db6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:21 -1000 Subject: sched_ext: Add vtime-ordered priority queue to dispatch_q's Currently, a dsq is always a FIFO. A task which is dispatched earlier gets consumed or executed earlier. While this is sufficient when dsq's are used for simple staging areas for tasks which are ready to execute, it'd make dsq's a lot more useful if they can implement custom ordering. This patch adds a vtime-ordered priority queue to dsq's. 
When the BPF scheduler dispatches a task with the new scx_bpf_dispatch_vtime() helper, it can specify the vtime that the task should be inserted at and the task is inserted into the priority queue in the dsq which is ordered according to time_before64() comparison of the vtime values. A DSQ can either be a FIFO or priority queue and automatically switches between the two depending on whether scx_bpf_dispatch() or scx_bpf_dispatch_vtime() is used. Using the wrong variant while the DSQ already has the other type queued is not allowed and triggers an ops error. Built-in DSQs must always be FIFOs. This makes it easy and efficient for BPF schedulers to implement proper vtime based scheduling within each dsq, at a negligible cost in terms of code complexity and overhead. scx_simple and scx_example_flatcg are updated to default to weighted vtime scheduling (the latter within each cgroup). FIFO scheduling can be selected with the -f option. v4: - As allowing mixing priority queue and FIFO on the same DSQ sometimes led to unexpected starvations, DSQs now error out if both modes are used at the same time and the built-in DSQs are no longer allowed to be priority queues. - Explicit type struct scx_dsq_node added to contain fields needed to be linked on DSQs. This will be used to implement a stateful iterator. - Tasks are now always linked on dsq->list whether the DSQ is in FIFO or PRIQ mode. This confines PRIQ related complexities to the enqueue and dequeue paths. Other paths only need to look at dsq->list. This will also ease implementing a BPF iterator. - Print p->scx.dsq_flags in debug dump. v3: - SCX_TASK_DSQ_ON_PRIQ flag is moved from p->scx.flags into its own p->scx.dsq_flags. The flag is protected with the dsq lock unlike other flags in p->scx.flags. This led to flag corruption in some cases. - Add comments explaining the interaction between using consumption of p->scx.slice to determine vtime progress and yielding. v2: - p->scx.dsq_vtime was not initialized on load or across cgroup migrations leading to some tasks being stalled for extended periods of time depending on how saturated the machine is. Fixed. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- include/linux/sched/ext.h | 29 +++++- init/init_task.c | 2 +- kernel/sched/ext.c | 156 +++++++++++++++++++++++++---- tools/sched_ext/include/scx/common.bpf.h | 1 + tools/sched_ext/scx_simple.bpf.c | 97 ++++++++++++++++++- tools/sched_ext/scx_simple.c | 8 +- 6 files changed, 267 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3db7b32b2d1d..9cee193dab19 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -49,12 +49,15 @@ enum scx_dsq_id_flags { }; /* - * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the - * scheduler core and the BPF scheduler. See the documentation for more details. + * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered + * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to + * buffer between the scheduler core and the BPF scheduler. See the + * documentation for more details.
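 *
 * (Editor's illustration, not part of the patch.) The priority queue orders
 * entries with the time_before64() signed-difference idiom: (s64)(5 - 10) < 0,
 * so vtime 5 sorts before 10, while (s64)(U64_MAX - 5) < 0 as well, so a
 * numerically larger vtime can sort earlier near the u64 wrap point. See the
 * scx_bpf_dispatch_vtime() comment further below.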
*/ struct scx_dispatch_q { raw_spinlock_t lock; struct list_head list; /* tasks in dispatch order */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; u64 id; struct rhash_head hash_node; @@ -86,6 +89,11 @@ enum scx_task_state { SCX_TASK_NR_STATES, }; +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + /* * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from * everywhere and the following bits track which kfunc sets are currently @@ -111,13 +119,19 @@ enum scx_kf_mask { __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; +struct scx_dsq_node { + struct list_head list; /* dispatch order */ + struct rb_node priq; /* p->scx.dsq_vtime order */ + u32 flags; /* SCX_TASK_DSQ_* flags */ +}; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. */ struct sched_ext_entity { struct scx_dispatch_q *dsq; - struct list_head dsq_node; + struct scx_dsq_node dsq_node; /* protected by dsq lock */ u32 flags; /* protected by rq lock */ u32 weight; s32 sticky_cpu; @@ -149,6 +163,15 @@ struct sched_ext_entity { */ u64 slice; + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + /* * If set, reject future sched_setscheduler(2) calls updating the policy * to %SCHED_EXT with -%EACCES. diff --git a/init/init_task.c b/init/init_task.c index 8a44c932d10f..5726b3a0eea9 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -102,7 +102,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_SCHED_CLASS_EXT .scx = { - .dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node), + .dsq_node.list = LIST_HEAD_INIT(init_task.scx.dsq_node.list), .sticky_cpu = -1, .holding_cpu = -1, .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1feb690be9d8..f186c576e7d9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -638,6 +638,7 @@ enum scx_enq_flags { __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, }; enum scx_deq_flags { @@ -1351,6 +1352,17 @@ static void update_curr_scx(struct rq *rq) } } +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_node.priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_node.priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) { /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ @@ -1362,7 +1374,9 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, { bool is_local = dsq->id == SCX_DSQ_LOCAL; - WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.list)); + WARN_ON_ONCE((p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); if (!is_local) { raw_spin_lock(&dsq->lock); @@ -1375,10 +1389,59 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } 
- if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) - list_add(&p->scx.dsq_node, &dsq->list); - else - list_add_tail(&p->scx.dsq_node, &dsq->list); + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. + */ + scx_ops_error("cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + struct rb_node *rbp; + + /* + * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are + * linked to both the rbtree and list on PRIQs, this can only be + * tested easily when adding the first task. + */ + if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && + !list_empty(&dsq->list))) + scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); + + p->scx.dsq_node.flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); + + /* + * Find the previous task and insert after it on the list so + * that @dsq->list is vtime ordered. + */ + rbp = rb_prev(&p->scx.dsq_node.priq); + if (rbp) { + struct task_struct *prev = + container_of(rbp, struct task_struct, + scx.dsq_node.priq); + list_add(&p->scx.dsq_node.list, &prev->scx.dsq_node.list); + } else { + list_add(&p->scx.dsq_node.list, &dsq->list); + } + } else { + /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ + if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_node.list, &dsq->list); + else + list_add_tail(&p->scx.dsq_node.list, &dsq->list); + } dsq_mod_nr(dsq, 1); p->scx.dsq = dsq; @@ -1417,13 +1480,30 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + if (p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase(&p->scx.dsq_node.priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + p->scx.dsq_node.flags &= ~SCX_TASK_DSQ_ON_PRIQ; + } + + list_del_init(&p->scx.dsq_node.list); +} + +static bool task_linked_on_dsq(struct task_struct *p) +{ + return !list_empty(&p->scx.dsq_node.list); +} + static void dispatch_dequeue(struct rq *rq, struct task_struct *p) { struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &rq->scx.local_dsq; if (!dsq) { - WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(task_linked_on_dsq(p)); /* * When dispatching directly from the BPF scheduler to a local * DSQ, the task isn't associated with any DSQ but @@ -1444,8 +1524,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) */ if (p->scx.holding_cpu < 0) { /* @p must still be on @dsq, dequeue */ - WARN_ON_ONCE(list_empty(&p->scx.dsq_node)); - list_del_init(&p->scx.dsq_node); + WARN_ON_ONCE(!task_linked_on_dsq(p)); + task_unlink_from_dsq(p, dsq); dsq_mod_nr(dsq, -1); } else { /* @@ -1454,7 +1534,7 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) * holding_cpu which tells dispatch_to_local_dsq() that it lost * the race. 
*/ - WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(task_linked_on_dsq(p)); p->scx.holding_cpu = -1; } p->scx.dsq = NULL; @@ -1949,7 +2029,8 @@ static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, /* @dsq is locked and @p is on this rq */ WARN_ON_ONCE(p->scx.holding_cpu >= 0); - list_move_tail(&p->scx.dsq_node, &rq->scx.local_dsq.list); + task_unlink_from_dsq(p, dsq); + list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list); dsq_mod_nr(dsq, -1); dsq_mod_nr(&rq->scx.local_dsq, 1); p->scx.dsq = &rq->scx.local_dsq; @@ -1992,7 +2073,7 @@ static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, * move_task_to_local_dsq(). */ WARN_ON_ONCE(p->scx.holding_cpu >= 0); - list_del_init(&p->scx.dsq_node); + task_unlink_from_dsq(p, dsq); dsq_mod_nr(dsq, -1); p->scx.holding_cpu = raw_smp_processor_id(); raw_spin_unlock(&dsq->lock); @@ -2024,7 +2105,7 @@ retry: raw_spin_lock(&dsq->lock); - list_for_each_entry(p, &dsq->list, scx.dsq_node) { + list_for_each_entry(p, &dsq->list, scx.dsq_node.list) { struct rq *task_rq = task_rq(p); if (rq == task_rq) { @@ -2543,7 +2624,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) static struct task_struct *first_local_task(struct rq *rq) { return list_first_entry_or_null(&rq->scx.local_dsq.list, - struct task_struct, scx.dsq_node); + struct task_struct, scx.dsq_node.list); } static struct task_struct *pick_next_task_scx(struct rq *rq) @@ -3225,7 +3306,8 @@ void init_scx_entity(struct sched_ext_entity *scx) */ memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); - INIT_LIST_HEAD(&scx->dsq_node); + INIT_LIST_HEAD(&scx->dsq_node.list); + RB_CLEAR_NODE(&scx->dsq_node.priq); scx->sticky_cpu = -1; scx->holding_cpu = -1; INIT_LIST_HEAD(&scx->runnable_node); @@ -4070,12 +4152,13 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, dump_line(s, " %c%c %s[%d] %+ldms", marker, task_state_to_char(p), p->comm, p->pid, jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); - dump_line(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu", + dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, - ops_state & SCX_OPSS_STATE_MASK, + p->scx.dsq_node.flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, + p->scx.dsq_vtime); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { @@ -4663,6 +4746,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, if (off >= offsetof(struct task_struct, scx.slice) && off + size <= offsetofend(struct task_struct, scx.slice)) return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; if (off >= offsetof(struct task_struct, scx.disallow) && off + size <= offsetofend(struct task_struct, scx.disallow)) return SCALAR_VALUE; @@ -5298,10 +5384,44 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, scx_dispatch_commit(p, dsq_id, enq_flags); } +/** + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + 
* @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id.
+ * Tasks queued into the priority queue are ordered by @vtime and always
+ * consumed after the tasks in the FIFO queue. All other aspects are identical
+ * to scx_bpf_dispatch().
+ *
+ * @vtime ordering is according to time_before64() which considers wrapping. A
+ * numerically larger vtime may indicate an earlier position in the ordering and
+ * vice-versa.
+ */
+__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
+					u64 slice, u64 vtime, u64 enq_flags)
+{
+	if (!scx_dispatch_preamble(p, enq_flags))
+		return;
+
+	if (slice)
+		p->scx.slice = slice;
+	else
+		p->scx.slice = p->scx.slice ?: 1;
+
+	p->scx.dsq_vtime = vtime;
+
+	scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 8686f84497db..3fa87084cf17 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -31,6 +31,7 @@ static inline void ___vmlinux_h_sanity_check___(void)
 s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
 s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
 void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
index 6bb13a3c801b..ed7e8d535fc5 100644
--- a/tools/sched_ext/scx_simple.bpf.c
+++ b/tools/sched_ext/scx_simple.bpf.c
@@ -2,11 +2,20 @@
 /*
  * A simple scheduler.
  *
- * A simple global FIFO scheduler. It also demonstrates the following niceties.
+ * By default, it operates as a simple global weighted vtime scheduler and can
+ * be switched to FIFO scheduling. It also demonstrates the following niceties.
  *
  * - Statistics tracking how many tasks are queued to local and global dsq's.
  * - Termination notification for userspace.
  *
+ * While very simple, this scheduler should work reasonably well on CPUs with a
+ * uniform L3 cache topology. Although preemption is not implemented, the fact
+ * that the scheduling queue is shared across all CPUs means that whatever is
+ * at the front of the queue is likely to be executed fairly quickly given
+ * enough CPUs. The FIFO scheduling mode may be beneficial to some workloads
+ * but comes with the usual problems of FIFO scheduling, where saturating
+ * threads can easily drown out interactive ones.
+ *
  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  * Copyright (c) 2022 Tejun Heo
  * Copyright (c) 2022 David Vernet
@@ -15,8 +24,20 @@
 
 char _license[] SEC("license") = "GPL";
 
+const volatile bool fifo_sched;
+
+static u64 vtime_now;
 UEI_DEFINE(uei);
 
+/*
+ * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues
+ * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We
+ * therefore create a separate DSQ with ID 0 that we dispatch to and consume
+ * from. If scx_simple only supported global FIFO scheduling, then we could
+ * just use SCX_DSQ_GLOBAL.
+ */
+#define SHARED_DSQ 0
+
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(key_size, sizeof(u32));
@@ -31,6 +52,11 @@ static void stat_inc(u32 idx)
 		(*cnt_p)++;
 }
 
+static inline bool vtime_before(u64 a, u64 b)
+{
+	return (s64)(a - b) < 0;
+}
+
 s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
 {
 	bool is_idle = false;
@@ -48,7 +74,69 @@ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 w
 void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	stat_inc(1);	/* count global queueing */
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+
+	if (fifo_sched) {
+		scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+	} else {
+		u64 vtime = p->scx.dsq_vtime;
+
+		/*
+		 * Limit the amount of budget that an idling task can accumulate
+		 * to one slice.
+		 */
+		if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
+			vtime = vtime_now - SCX_SLICE_DFL;
+
+		scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
+				       enq_flags);
+	}
+}
+
+void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_consume(SHARED_DSQ);
+}
+
+void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
+{
+	if (fifo_sched)
+		return;
+
+	/*
+	 * Global vtime always progresses forward as tasks start executing. The
+	 * test and update can be performed concurrently from multiple CPUs and
+	 * are thus racy. Any error should be contained and temporary. Let's
+	 * just live with it.
+	 */
+	if (vtime_before(vtime_now, p->scx.dsq_vtime))
+		vtime_now = p->scx.dsq_vtime;
+}
+
+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
+{
+	if (fifo_sched)
+		return;
+
+	/*
+	 * Scale the execution time by the inverse of the weight and charge.
+	 *
+	 * Note that the default yield implementation yields by setting
+	 * @p->scx.slice to zero and the following would treat the yielding task
+	 * as if it has consumed all its slice. If this penalizes yielding tasks
+	 * too much, determine the execution time by taking explicit timestamps
+	 * instead of depending on @p->scx.slice.
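+	 *
+	 * For example, with the default weight of 100, a task which consumed
+	 * its whole SCX_SLICE_DFL slice is charged exactly SCX_SLICE_DFL of
+	 * vtime, while a task with twice the weight (200) is charged half of
+	 * that for the same execution time.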
+ */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) +{ + return scx_bpf_create_dsq(SHARED_DSQ, -1); } void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) @@ -59,5 +147,10 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) SCX_OPS_DEFINE(simple_ops, .select_cpu = (void *)simple_select_cpu, .enqueue = (void *)simple_enqueue, + .dispatch = (void *)simple_dispatch, + .running = (void *)simple_running, + .stopping = (void *)simple_stopping, + .enable = (void *)simple_enable, + .init = (void *)simple_init, .exit = (void *)simple_exit, .name = "simple"); diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index bead482e1383..76d83199545c 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -17,8 +17,9 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-v]\n" +"Usage: %s [-f] [-v]\n" "\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -70,8 +71,11 @@ int main(int argc, char **argv) restart: skel = SCX_OPS_OPEN(simple_ops, scx_simple); - while ((opt = getopt(argc, argv, "vh")) != -1) { + while ((opt = getopt(argc, argv, "fvh")) != -1) { switch (opt) { + case 'f': + skel->rodata->fifo_sched = true; + break; case 'v': verbose = true; break; -- cgit v1.2.3 From fa48e8d2c7b58d242c1db3a09c14f4274e055087 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:21 -1000 Subject: sched_ext: Documentation: scheduler: Document extensible scheduler class Add Documentation/scheduler/sched-ext.rst which gives a high-level overview and pointers to the examples. v6: - Add paragraph explaining debug dump. v5: - Updated to reflect /sys/kernel interface change. Kconfig options added. v4: - README improved, reformatted in markdown and renamed to README.md. v3: - Added tools/sched_ext/README. - Dropped _example prefix from scheduler names. v2: - Apply minor edits suggested by Bagas. Caveats section dropped as all of them are addressed. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Bagas Sanjaya --- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/sched-ext.rst | 314 ++++++++++++++++++++++++++++++++++ include/linux/sched/ext.h | 2 + kernel/Kconfig.preempt | 1 + kernel/sched/ext.c | 2 + kernel/sched/ext.h | 2 + tools/sched_ext/README.md | 258 ++++++++++++++++++++++++++++ 7 files changed, 580 insertions(+) create mode 100644 Documentation/scheduler/sched-ext.rst create mode 100644 tools/sched_ext/README.md (limited to 'tools') diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 43bd8a145b7a..0611dc3dda8e 100644 --- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -20,6 +20,7 @@ Scheduler sched-nice-design sched-rt-group sched-stats + sched-ext sched-debug text_files diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst new file mode 100644 index 000000000000..497eeaa5ecbe --- /dev/null +++ b/Documentation/scheduler/sched-ext.rst @@ -0,0 +1,314 @@ +========================== +Extensible Scheduler Class +========================== + +sched_ext is a scheduler class whose behavior can be defined by a set of BPF +programs - the BPF scheduler. 
+
+* sched_ext exports a full scheduling interface so that any scheduling
+  algorithm can be implemented on top.
+
+* The BPF scheduler can group CPUs however it sees fit and schedule them
+  together, as tasks aren't tied to specific CPUs at the time of wakeup.
+
+* The BPF scheduler can be turned on and off dynamically at any time.
+
+* The system integrity is maintained no matter what the BPF scheduler does.
+  The default scheduling behavior is restored whenever an error is detected,
+  a runnable task stalls, or the SysRq key sequence :kbd:`SysRq-S` is
+  invoked.
+
+* When the BPF scheduler triggers an error, debug information is dumped to
+  aid debugging. The debug dump is passed to and printed out by the
+  scheduler binary. The debug dump can also be accessed through the
+  ``sched_ext_dump`` tracepoint. The SysRq key sequence :kbd:`SysRq-D`
+  triggers a debug dump. A dump triggered this way doesn't terminate the
+  BPF scheduler and can only be read through the tracepoint.
+
+Switching to and from sched_ext
+===============================
+
+``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and
+``tools/sched_ext`` contains the example schedulers. The following config
+options should be enabled to use sched_ext:
+
+.. code-block:: none
+
+    CONFIG_BPF=y
+    CONFIG_SCHED_CLASS_EXT=y
+    CONFIG_BPF_SYSCALL=y
+    CONFIG_BPF_JIT=y
+    CONFIG_DEBUG_INFO_BTF=y
+    CONFIG_BPF_JIT_ALWAYS_ON=y
+    CONFIG_BPF_JIT_DEFAULT_ON=y
+    CONFIG_PAHOLE_HAS_SPLIT_BTF=y
+    CONFIG_PAHOLE_HAS_BTF_TAG=y
+
+sched_ext is used only when the BPF scheduler is loaded and running.
+
+If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be
+treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is
+loaded. On load, such tasks will be switched to and scheduled by sched_ext.
+
+The BPF scheduler can choose to schedule all normal and lower class tasks by
+calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this
+case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and
+``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers,
+this mode can be selected with the ``-a`` option.
+
+Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or
+detecting any internal error, including stalled runnable tasks, aborts the
+BPF scheduler and reverts all tasks back to CFS.
+
+.. code-block:: none
+
+    # make -j16 -C tools/sched_ext
+    # tools/sched_ext/scx_simple
+    local=0 global=3
+    local=5 global=24
+    local=9 global=44
+    local=13 global=56
+    local=17 global=72
+    ^CEXIT: BPF scheduler unregistered
+
+The current status of the BPF scheduler can be determined as follows:
+
+.. code-block:: none
+
+    # cat /sys/kernel/sched_ext/state
+    enabled
+    # cat /sys/kernel/sched_ext/root/ops
+    simple
+
+``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
+detailed information:
+
+.. code-block:: none
+
+    # tools/sched_ext/scx_show_state.py
+    ops           : simple
+    enabled       : 1
+    switching_all : 1
+    switched_all  : 1
+    enable_state  : enabled (2)
+    bypass_depth  : 0
+    nr_rejected   : 0
+
+If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can
+be determined as follows:
+
+.. code-block:: none
+
+    # grep ext /proc/self/sched
+    ext.enabled : 1
+
+The Basics
+==========
+
+Userspace can implement an arbitrary BPF scheduler by loading a set of BPF
+programs that implement ``struct sched_ext_ops``. The only mandatory field
+is ``ops.name`` which must be a valid BPF object name. All operations are
+optional.
The following modified excerpt is from
+``tools/sched_ext/scx_simple.bpf.c`` showing a minimal global FIFO scheduler.
+
+.. code-block:: c
+
+    /*
+     * Decide which CPU a task should be migrated to before being
+     * enqueued (either at wakeup, fork time, or exec time). If an
+     * idle core is found by the default ops.select_cpu() implementation,
+     * then dispatch the task directly to SCX_DSQ_LOCAL and skip the
+     * ops.enqueue() callback.
+     *
+     * Note that this implementation has exactly the same behavior as the
+     * default ops.select_cpu implementation. The behavior of the scheduler
+     * would be exactly the same if the implementation just didn't define
+     * the simple_select_cpu() struct_ops prog.
+     */
+    s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p,
+                       s32 prev_cpu, u64 wake_flags)
+    {
+        s32 cpu;
+        /* Need to initialize or the BPF verifier will reject the program */
+        bool direct = false;
+
+        cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct);
+
+        if (direct)
+            scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+
+        return cpu;
+    }
+
+    /*
+     * Do a direct dispatch of a task to the global DSQ. This ops.enqueue()
+     * callback will only be invoked if we failed to find a core to dispatch
+     * to in ops.select_cpu() above.
+     *
+     * Note that this implementation has exactly the same behavior as the
+     * default ops.enqueue implementation, which just dispatches the task
+     * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly
+     * the same if the implementation just didn't define the simple_enqueue
+     * struct_ops prog.
+     */
+    void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
+    {
+        scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+    }
+
+    s32 BPF_STRUCT_OPS(simple_init)
+    {
+        /*
+         * All SCHED_OTHER, SCHED_IDLE, and SCHED_BATCH tasks should
+         * use sched_ext.
+         */
+        scx_bpf_switch_all();
+        return 0;
+    }
+
+    void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
+    {
+        exit_type = ei->type;
+    }
+
+    SEC(".struct_ops")
+    struct sched_ext_ops simple_ops = {
+        .select_cpu = (void *)simple_select_cpu,
+        .enqueue    = (void *)simple_enqueue,
+        .init       = (void *)simple_init,
+        .exit       = (void *)simple_exit,
+        .name       = "simple",
+    };
+
+Dispatch Queues
+---------------
+
+To match the impedance between the scheduler core and the BPF scheduler,
+sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a
+priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``),
+and one local DSQ per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage
+an arbitrary number of DSQs using ``scx_bpf_create_dsq()`` and
+``scx_bpf_destroy_dsq()``.
+
+A CPU always executes a task from its local DSQ. A task is "dispatched" to a
+DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's
+local DSQ.
+
+When a CPU is looking for the next task to run, if the local DSQ is not
+empty, the first task is picked. Otherwise, the CPU tries to consume the
+global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()``
+is invoked.
+
+Scheduling Cycle
+----------------
+
+The following briefly shows how a waking task is scheduled and executed.
+
+1. When a task is waking up, ``ops.select_cpu()`` is the first operation
+   invoked. This serves two purposes. First, it provides a CPU selection
+   optimization hint. Second, it wakes up the selected CPU if it is idle.
+
+   The CPU selected by ``ops.select_cpu()`` is an optimization hint and not
+   binding. The actual decision is made at the last step of scheduling.
+   However, there is a small performance gain if the CPU
+   ``ops.select_cpu()`` returns matches the CPU the task eventually runs on.
+
+   A side-effect of selecting a CPU is waking it up from idle. While a BPF
+   scheduler can wake up any CPU using the ``scx_bpf_kick_cpu()`` helper,
+   using ``ops.select_cpu()`` judiciously can be simpler and more efficient.
+
+   A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by
+   calling ``scx_bpf_dispatch()``. If the task is dispatched to
+   ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the
+   local DSQ of whichever CPU is returned from ``ops.select_cpu()``.
+   Additionally, dispatching directly from ``ops.select_cpu()`` will cause the
+   ``ops.enqueue()`` callback to be skipped.
+
+   Note that the scheduler core will ignore an invalid CPU selection, for
+   example, if it's outside the allowed cpumask of the task.
+
+2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the
+   task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()``
+   can make one of the following decisions:
+
+   * Immediately dispatch the task to either the global or local DSQ by
+     calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or
+     ``SCX_DSQ_LOCAL``, respectively.
+
+   * Immediately dispatch the task to a custom DSQ by calling
+     ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63.
+
+   * Queue the task on the BPF side.
+
+3. When a CPU is ready to schedule, it first looks at its local DSQ. If
+   empty, it then looks at the global DSQ. If there still isn't a task to
+   run, ``ops.dispatch()`` is invoked which can use the following two
+   functions to populate the local DSQ.
+
+   * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can
+     be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``,
+     ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()``
+     currently can't be called with BPF locks held, this is being worked on
+     and will be supported. ``scx_bpf_dispatch()`` schedules the dispatch
+     rather than performing it immediately. There can be up to
+     ``ops.dispatch_max_batch`` pending tasks.
+
+   * ``scx_bpf_consume()`` transfers a task from the specified non-local DSQ
+     to the dispatching DSQ. This function cannot be called with any BPF
+     locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks
+     before trying to consume the specified DSQ.
+
+4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ,
+   the CPU runs the first one. If empty, the following steps are taken:
+
+   * Try to consume the global DSQ. If successful, run the task.
+
+   * If ``ops.dispatch()`` has dispatched any tasks, retry #3.
+
+   * If the previous task is an SCX task and still runnable, keep executing
+     it (see ``SCX_OPS_ENQ_LAST``).
+
+   * Go idle.
+
+Note that the BPF scheduler can always choose to dispatch tasks immediately
+in ``ops.enqueue()`` as illustrated in the above simple example. If only the
+built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as
+a task is never queued on the BPF scheduler and both the local and global
+DSQs are consumed automatically.
+
+``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use
+``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as
+``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue
+dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the
+function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for
+more information.
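+
+As a rough sketch (not taken verbatim from the example schedulers), an
+``ops.enqueue()`` implementation that uses the priority queue of a custom
+DSQ might look as follows. The DSQ ID ``MY_DSQ_ID`` is a placeholder and is
+assumed to have been created with ``scx_bpf_create_dsq()`` beforehand:
+
+.. code-block:: c
+
+    void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
+    {
+        /*
+         * Order tasks within the custom DSQ by their vtime instead of
+         * queueing them FIFO. A matching ops.dispatch() would then call
+         * scx_bpf_consume(MY_DSQ_ID) to move tasks to the local DSQ.
+         */
+        scx_bpf_dispatch_vtime(p, MY_DSQ_ID, SCX_SLICE_DFL,
+                               p->scx.dsq_vtime, enq_flags);
+    }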
+
+Where to Look
+=============
+
+* ``include/linux/sched/ext.h`` defines the core data structures, ops table
+  and constants.
+
+* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers.
+  The functions prefixed with ``scx_bpf_`` can be called from the BPF
+  scheduler.
+
+* ``tools/sched_ext/`` hosts example BPF scheduler implementations.
+
+  * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a
+    custom DSQ.
+
+  * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five
+    levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``.
+
+ABI Instability
+===============
+
+The APIs provided by sched_ext to BPF scheduler programs have no stability
+guarantees. This includes the ops table callbacks and constants defined in
+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in
+``kernel/sched/ext.c``.
+
+While we will attempt to provide a relatively stable API surface when
+possible, these interfaces are subject to change without warning between
+kernel versions.
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 9cee193dab19..fe9a67ffe6b1 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo
 * Copyright (c) 2022 David Vernet
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 7dde5e424ac3..f035c87d02f1 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -156,4 +156,5 @@ config SCHED_CLASS_EXT
 	  similar to struct sched_class.
 
 	  For more information:
+	  Documentation/scheduler/sched-ext.rst
 	  https://github.com/sched-ext/scx
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f186c576e7d9..f814e84ceeb3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo
 * Copyright (c) 2022 David Vernet
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 6555878c5da3..c41d742b5d62 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo
 * Copyright (c) 2022 David Vernet
diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md
new file mode 100644
index 000000000000..8efe70cc4363
--- /dev/null
+++ b/tools/sched_ext/README.md
@@ -0,0 +1,258 @@
+SCHED_EXT EXAMPLE SCHEDULERS
+============================
+
+# Introduction
+
+This directory contains a number of example sched_ext schedulers. These
+schedulers are meant to provide examples of different types of schedulers
+that can be built using sched_ext, and illustrate how various features of
+sched_ext can be used.
+
+Some of the examples are performant, production-ready schedulers. That is, for
+the correct workload and with the correct tuning, they may be deployed in a
+production environment with acceptable or possibly even improved performance.
+Others are just examples that, in practice, would not provide acceptable
+performance (though they could be improved to get there).
+
+This README will describe these example schedulers, including the types of
+workloads or scenarios they're designed to accommodate, and whether or not
+they're production ready. For more details on any of these schedulers,
+please see the header comment in their .bpf.c file.
+
+
+# Compiling the examples
+
+There are a few toolchain dependencies for compiling the example schedulers.
+
+## Toolchain dependencies
+
+1. clang >= 16.0.0
+
+The schedulers are BPF programs, and therefore must be compiled with clang.
+gcc is actively working on adding a BPF backend compiler as well, but it is
+still missing some features such as BTF type tags which are necessary for
+using kptrs.
+
+2. pahole >= 1.25
+
+You may need pahole in order to generate BTF from DWARF.
+
+3. rust >= 1.70.0
+
+Rust schedulers use features present in the rust toolchain >= 1.70.0. You
+should be able to use the stable build from rustup, but if that doesn't
+work, try using the rustup nightly build.
+
+There are other requirements as well, such as make, but these are the main /
+non-trivial ones.
+
+## Compiling the kernel
+
+In order to run a sched_ext scheduler, you'll have to run a kernel compiled
+with the patches in this repository, and with a minimum set of necessary
+Kconfig options:
+
+```
+CONFIG_BPF=y
+CONFIG_SCHED_CLASS_EXT=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF_JIT=y
+CONFIG_DEBUG_INFO_BTF=y
+```
+
+It's also recommended that you include the following Kconfig options:
+
+```
+CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_JIT_DEFAULT_ON=y
+CONFIG_PAHOLE_HAS_SPLIT_BTF=y
+CONFIG_PAHOLE_HAS_BTF_TAG=y
+```
+
+There is a `Kconfig` file in this directory whose contents you can append to
+your local `.config` file, as long as there are no conflicts with any existing
+options in the file.
+
+## Getting a vmlinux.h file
+
+You may notice that most of the example schedulers include a "vmlinux.h" file.
+This is a large, auto-generated header file that contains all of the types
+defined in some vmlinux binary that was compiled with
+[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig
+options specified above).
+
+The header file is created using `bpftool`, by passing it a vmlinux binary
+compiled with BTF as follows:
+
+```bash
+$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h
+```
+
+`bpftool` analyzes all of the BTF encodings in the binary, and produces a
+header file that can be included by BPF programs to access those types. For
+example, using vmlinux.h allows a scheduler to access fields defined directly
+in vmlinux as follows:
+
+```c
+#include "vmlinux.h"
+// vmlinux.h is also implicitly included by scx_common.bpf.h.
+#include "scx_common.bpf.h"
+
+/*
+ * vmlinux.h provides definitions for struct task_struct and
+ * struct scx_enable_args.
+ */
+void BPF_STRUCT_OPS(example_enable, struct task_struct *p,
+		    struct scx_enable_args *args)
+{
+	bpf_printk("Task %s enabled in example scheduler", p->comm);
+}
+
+// vmlinux.h provides the definition for struct sched_ext_ops.
+SEC(".struct_ops.link")
+struct sched_ext_ops example_ops = {
+	.enable	= (void *)example_enable,
+	.name	= "example",
+};
+```
+
+The scheduler build system will generate this vmlinux.h file as part of the
+scheduler build pipeline. It looks for a vmlinux file in the following
+dependency order:
+
+1. If the O= environment variable is defined, at `$O/vmlinux`
+2. If the KBUILD_OUTPUT= environment variable is defined, at
+   `$KBUILD_OUTPUT/vmlinux`
+3. At `../../vmlinux` (i.e.
at the root of the kernel tree where you're
+   compiling the schedulers)
+4. `/sys/kernel/btf/vmlinux`
+5. `/boot/vmlinux-$(uname -r)`
+
+In other words, if you have compiled a kernel in your local repo, its vmlinux
+file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of
+the kernel you're currently running on. This means that if you're running on a
+kernel with sched_ext support, you may not need to compile a local kernel at
+all.
+
+### Aside on CO-RE
+
+One of the cooler features of BPF is that it supports
+[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run
+Everywhere). This feature allows you to reference fields inside of structs with
+types defined internal to the kernel, and not have to recompile if you load the
+BPF program on a different kernel with the field at a different offset. In our
+example above, we print out a task name with `p->comm`. CO-RE would perform
+relocations for that access when the program is loaded to ensure that it's
+referencing the correct offset for the currently running kernel.
+
+## Compiling the schedulers
+
+Once you have your toolchain set up, and a vmlinux that can be used to generate
+a full vmlinux.h file, you can compile the schedulers using `make`:
+
+```bash
+$ make -j$(nproc)
+```
+
+# Example schedulers
+
+This directory contains the following example schedulers. These schedulers are
+for testing and demonstrating different aspects of sched_ext. While some may be
+useful in limited scenarios, they are not intended to be practical.
+
+For more scheduler implementations, tools and documentation, visit
+https://github.com/sched-ext/scx.
+
+## scx_simple
+
+A simple scheduler that provides an example of a minimal sched_ext scheduler.
+scx_simple can be run in either global weighted vtime mode, or FIFO mode.
+
+Though very simple, in limited scenarios, this scheduler can perform reasonably
+well on single-socket systems with a unified L3 cache.
+
+## scx_qmap
+
+Another simple, yet slightly more complex scheduler that provides an example of
+a basic weighted FIFO queuing policy. It also provides examples of some common
+useful BPF features, such as sleepable per-task storage allocation in the
+`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to
+enqueue tasks. It also illustrates how core-sched support could be implemented.
+
+## scx_central
+
+A "central" scheduler where scheduling decisions are made from a single CPU.
+This scheduler illustrates how scheduling decisions can be made from a single
+CPU, allowing other cores to run with infinite slices, without timer ticks, and
+without having to incur the overhead of making scheduling decisions.
+
+The approach demonstrated by this scheduler may be useful for any workload that
+benefits from minimizing scheduling overhead and timer ticks. An example of
+where this could be particularly useful is running VMs, where running with
+infinite slices and no timer ticks allows the VM to avoid unnecessary expensive
+vmexits.
+
+
+# Troubleshooting
+
+There are a number of common issues that you may run into when building the
+schedulers. We'll go over some of the common ones here.
+
+## Build Failures
+
+### Old version of clang
+
+```
+error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole
+	_Static_assert(SCX_DSQ_FLAG_BUILTIN,
+	^~~~~~~~~~~~~~~~~~~~
+1 error generated.
+```
+
+This means you built the kernel or the schedulers with an older version of
+clang than what's supported (i.e. older than 16.0.0). To remediate this:
+
+1. `which clang` to make sure you're using a sufficiently new version of clang.
+
+2. `make fullclean` in the root path of the repository to remove any stale
+   build artifacts.
+
+3. Rebuild the kernel, and then your example schedulers.
+
+The schedulers are also cleaned if you invoke `make mrproper` in the root
+directory of the tree.
+
+### Stale kernel build / incomplete vmlinux.h file
+
+As described above, you'll need a `vmlinux.h` file that was generated from a
+vmlinux built with BTF, and with sched_ext support enabled. If you don't,
+you'll see errors such as the following which indicate that a type being
+referenced in a scheduler is unknown:
+
+```
+/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info'
+
+const struct scx_exit_info *ei)
+
+^
+```
+
+In order to resolve this, please follow the steps above in
+[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your
+schedulers are using a vmlinux.h file that includes the requisite types.
+
+## Misc
+
+### llvm: [OFF]
+
+You may see the following output when building the schedulers:
+
+```
+Auto-detecting system features:
+...                         clang-bpf-co-re: [ on  ]
+...                                    llvm: [ OFF ]
+...                                  libcap: [ on  ]
+...                                  libbfd: [ on  ]
+```
+
+Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore it.
--
cgit v1.2.3


From a5db7817af780db6a7f290c79677eff4fd13c5fa Mon Sep 17 00:00:00 2001
From: David Vernet
Date: Tue, 18 Jun 2024 10:09:21 -1000
Subject: sched_ext: Add selftests

Add basic selftests.

Signed-off-by: David Vernet
Acked-by: Tejun Heo
---
 tools/testing/selftests/sched_ext/.gitignore       |   6 +
 tools/testing/selftests/sched_ext/Makefile         | 218 +++++++++++++++++++++
 tools/testing/selftests/sched_ext/config           |   9 +
 tools/testing/selftests/sched_ext/create_dsq.bpf.c |  58 ++++++
 tools/testing/selftests/sched_ext/create_dsq.c     |  57 ++++++
 .../selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c  |  42 ++++
 .../selftests/sched_ext/ddsp_bogus_dsq_fail.c      |  57 ++++++
 .../selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c |  39 ++++
 .../selftests/sched_ext/ddsp_vtimelocal_fail.c     |  56 ++++++
 .../testing/selftests/sched_ext/dsp_local_on.bpf.c |  65 ++++++
 tools/testing/selftests/sched_ext/dsp_local_on.c   |  58 ++++++
 .../sched_ext/enq_last_no_enq_fails.bpf.c          |  21 ++
 .../selftests/sched_ext/enq_last_no_enq_fails.c    |  60 ++++++
 .../selftests/sched_ext/enq_select_cpu_fails.bpf.c |  43 ++++
 .../selftests/sched_ext/enq_select_cpu_fails.c     |  61 ++++++
 tools/testing/selftests/sched_ext/exit.bpf.c       |  84 ++++++++
 tools/testing/selftests/sched_ext/exit.c           |  55 ++++++
 tools/testing/selftests/sched_ext/exit_test.h      |  20 ++
 tools/testing/selftests/sched_ext/hotplug.bpf.c    |  61 ++++++
 tools/testing/selftests/sched_ext/hotplug.c        | 168 ++++++++++++++++
 tools/testing/selftests/sched_ext/hotplug_test.h   |  15 ++
 .../selftests/sched_ext/init_enable_count.bpf.c    |  53 +++++
 .../selftests/sched_ext/init_enable_count.c        | 166 ++++++++++++++++
 tools/testing/selftests/sched_ext/maximal.bpf.c    | 132 +++++++++++++
 tools/testing/selftests/sched_ext/maximal.c        |  51 +++++
 tools/testing/selftests/sched_ext/maybe_null.bpf.c |  36 ++++
 tools/testing/selftests/sched_ext/maybe_null.c     |  49 +++++
 .../selftests/sched_ext/maybe_null_fail_dsp.bpf.c  |  25 +++
 .../selftests/sched_ext/maybe_null_fail_yld.bpf.c  |  28 +++
 tools/testing/selftests/sched_ext/minimal.bpf.c    |  21 ++
tools/testing/selftests/sched_ext/minimal.c | 58 ++++++ tools/testing/selftests/sched_ext/prog_run.bpf.c | 32 +++ tools/testing/selftests/sched_ext/prog_run.c | 78 ++++++++ tools/testing/selftests/sched_ext/reload_loop.c | 75 +++++++ tools/testing/selftests/sched_ext/runner.c | 201 +++++++++++++++++++ tools/testing/selftests/sched_ext/scx_test.h | 131 +++++++++++++ .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 ++++ tools/testing/selftests/sched_ext/select_cpu_dfl.c | 72 +++++++ .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 +++++++++ .../sched_ext/select_cpu_dfl_nodispatch.c | 72 +++++++ .../selftests/sched_ext/select_cpu_dispatch.bpf.c | 41 ++++ .../selftests/sched_ext/select_cpu_dispatch.c | 70 +++++++ .../sched_ext/select_cpu_dispatch_bad_dsq.bpf.c | 37 ++++ .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 ++++++ .../sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c | 38 ++++ .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 ++++++ .../selftests/sched_ext/select_cpu_vtime.bpf.c | 92 +++++++++ .../testing/selftests/sched_ext/select_cpu_vtime.c | 59 ++++++ tools/testing/selftests/sched_ext/test_example.c | 49 +++++ tools/testing/selftests/sched_ext/util.c | 71 +++++++ tools/testing/selftests/sched_ext/util.h | 13 ++ 51 files changed, 3244 insertions(+) create mode 100644 tools/testing/selftests/sched_ext/.gitignore create mode 100644 tools/testing/selftests/sched_ext/Makefile create mode 100644 tools/testing/selftests/sched_ext/config create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c create mode 100644 tools/testing/selftests/sched_ext/exit.c create mode 100644 tools/testing/selftests/sched_ext/exit_test.h create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c create mode 100644 tools/testing/selftests/sched_ext/hotplug.c create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c create mode 100644 tools/testing/selftests/sched_ext/maximal.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c create mode 100644 tools/testing/selftests/sched_ext/minimal.c create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c create mode 100644 
tools/testing/selftests/sched_ext/prog_run.c create mode 100644 tools/testing/selftests/sched_ext/reload_loop.c create mode 100644 tools/testing/selftests/sched_ext/runner.c create mode 100644 tools/testing/selftests/sched_ext/scx_test.h create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c create mode 100644 tools/testing/selftests/sched_ext/test_example.c create mode 100644 tools/testing/selftests/sched_ext/util.c create mode 100644 tools/testing/selftests/sched_ext/util.h (limited to 'tools') diff --git a/tools/testing/selftests/sched_ext/.gitignore b/tools/testing/selftests/sched_ext/.gitignore new file mode 100644 index 000000000000..ae5491a114c0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/.gitignore @@ -0,0 +1,6 @@ +* +!*.c +!*.h +!Makefile +!.gitignore +!config diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile new file mode 100644 index 000000000000..0754a2c110a1 --- /dev/null +++ b/tools/testing/selftests/sched_ext/Makefile @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../../../build/Build.include +include ../../../scripts/Makefile.arch +include ../../../scripts/Makefile.include +include ../lib.mk + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := gcc +endif # LLVM + +ifneq ($(CROSS_COMPILE),) +$(error CROSS_COMPILE not supported for scx selftests) +endif # CROSS_COMPILE + +CURDIR := $(abspath .) +REPOROOT := $(abspath ../../../..) 
+TOOLSDIR := $(REPOROOT)/tools
+LIBDIR := $(TOOLSDIR)/lib
+BPFDIR := $(LIBDIR)/bpf
+TOOLSINCDIR := $(TOOLSDIR)/include
+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
+APIDIR := $(TOOLSINCDIR)/uapi
+GENDIR := $(REPOROOT)/include/generated
+GENHDR := $(GENDIR)/autoconf.h
+SCXTOOLSDIR := $(TOOLSDIR)/sched_ext
+SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include
+
+OUTPUT_DIR := $(CURDIR)/build
+OBJ_DIR := $(OUTPUT_DIR)/obj
+INCLUDE_DIR := $(OUTPUT_DIR)/include
+BPFOBJ_DIR := $(OBJ_DIR)/libbpf
+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext
+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a
+LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a
+DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool
+HOST_BUILD_DIR := $(OBJ_DIR)
+HOST_OUTPUT_DIR := $(OUTPUT_DIR)
+
+VMLINUX_BTF_PATHS ?= ../../../../vmlinux \
+		     /sys/kernel/btf/vmlinux \
+		     /boot/vmlinux-$(shell uname -r)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+ifeq ($(VMLINUX_BTF),)
+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
+endif
+
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+
+ifneq ($(wildcard $(GENHDR)),)
+  GENFLAGS := -DHAVE_GENHDR
+endif
+
+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \
+	  -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
+	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR)
+
+# Silence some warnings when compiled with clang
+ifneq ($(LLVM),)
+CFLAGS += -Wno-unused-command-line-argument
+endif
+
+LDFLAGS = -lelf -lz -lpthread -lzstd
+
+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
+			grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
+
+define get_sys_includes
+$(shell $(1) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$2, $$2)}')
+endef
+
+$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR)
+ifeq ($(VMLINUX_H),)
+	$(call msg,GEN,,$@)
+	$(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
+else
+	$(call msg,CP,,$@)
+	$(Q)cp "$(VMLINUX_H)" $@
+endif
+
+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR)
+	$(call msg,CLNG-BPF,,$(notdir $@))
+	$(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@
+
+$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR)
+	$(eval sched=$(notdir $@))
+	$(call msg,GEN-SKEL,,$(sched))
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o)
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o)
+	$(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o)
+	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@
+	$(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h)
+
+################
+# C schedulers #
+################
+
+override define CLEAN
+	rm -rf $(OUTPUT_DIR)
+	rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h
+	rm -f $(TEST_GEN_PROGS)
+	rm -f runner
+endef
+
+# Every testcase takes all of the BPF progs as dependencies by default. This
+# allows testcases to load any BPF scheduler, which is useful for testcases
+# that don't need their own prog to run their test.
+all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog)))
+
+auto-test-targets := \
+	create_dsq \
+	enq_last_no_enq_fails \
+	enq_select_cpu_fails \
+	ddsp_bogus_dsq_fail \
+	ddsp_vtimelocal_fail \
+	dsp_local_on \
+	exit \
+	hotplug \
+	init_enable_count \
+	maximal \
+	maybe_null \
+	minimal \
+	prog_run \
+	reload_loop \
+	select_cpu_dfl \
+	select_cpu_dfl_nodispatch \
+	select_cpu_dispatch \
+	select_cpu_dispatch_bad_dsq \
+	select_cpu_dispatch_dbl_dsp \
+	select_cpu_vtime \
+	test_example \
+
+testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
+
+$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR)
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# Create all of the test targets' object files, whose testcase objects will be
+# registered into the runner in ELF constructors.
+#
+# Note that we must do double expansion here in order to support conditionally
+# compiling BPF object files only if one is present, as the wildcard Make
+# function doesn't support using implicit rules otherwise.
+$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR)
+	$(eval test=$(patsubst %.o,%.c,$(notdir $@)))
+	$(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o
+
+$(SCXOBJ_DIR)/util.o: util.c | $(SCXOBJ_DIR)
+	$(CC) $(CFLAGS) -c $< -o $@
+
+runner: $(SCXOBJ_DIR)/runner.o $(SCXOBJ_DIR)/util.o $(BPFOBJ) $(testcase-targets)
+	@echo "$(testcase-targets)"
+	$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
+
+TEST_GEN_PROGS := runner
+
+all: runner
+
+.PHONY: all clean help
+
+.DEFAULT_GOAL := all
+
+.DELETE_ON_ERROR:
+
+.SECONDARY:
diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config
new file mode 100644
index 000000000000..0de9b4ee249d
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/config
@@ -0,0 +1,9 @@
+CONFIG_SCHED_DEBUG=y
+CONFIG_SCHED_CLASS_EXT=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_EXT_GROUP_SCHED=y
+CONFIG_BPF=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_BTF=y
diff --git a/tools/testing/selftests/sched_ext/create_dsq.bpf.c b/tools/testing/selftests/sched_ext/create_dsq.bpf.c
new file mode 100644
index 000000000000..23f79ed343f0
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/create_dsq.bpf.c
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Create and destroy DSQs in a loop.
+ *
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +void BPF_STRUCT_OPS(create_dsq_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + scx_bpf_destroy_dsq(p->pid); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + s32 err; + + err = scx_bpf_create_dsq(p->pid, -1); + if (err) + scx_bpf_error("Failed to create DSQ for %s[%d]", + p->comm, p->pid); + + return err; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init) +{ + u32 i; + s32 err; + + bpf_for(i, 0, 1024) { + err = scx_bpf_create_dsq(i, -1); + if (err) { + scx_bpf_error("Failed to create DSQ %d", i); + return 0; + } + } + + bpf_for(i, 0, 1024) { + scx_bpf_destroy_dsq(i); + } + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops create_dsq_ops = { + .init_task = create_dsq_init_task, + .exit_task = create_dsq_exit_task, + .init = create_dsq_init, + .name = "create_dsq", +}; diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c new file mode 100644 index 000000000000..fa946d9146d4 --- /dev/null +++ b/tools/testing/selftests/sched_ext/create_dsq.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include "create_dsq.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct create_dsq *skel; + + skel = create_dsq__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct create_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.create_dsq_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct create_dsq *skel = ctx; + + create_dsq__destroy(skel); +} + +struct scx_test create_dsq = { + .name = "create_dsq", + .description = "Create and destroy a dsq in a loop", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&create_dsq) diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c new file mode 100644 index 000000000000..e97ad41d354a --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. 
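+		 * The dispatch falls back to the global DSQ and the scheduler
+		 * is then aborted with SCX_EXIT_ERROR, which the userspace
+		 * half of this test verifies.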
+ */ + scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { + .select_cpu = ddsp_bogus_dsq_fail_select_cpu, + .exit = ddsp_bogus_dsq_fail_exit, + .name = "ddsp_bogus_dsq_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c new file mode 100644 index 000000000000..e65d22f23f3b --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "ddsp_bogus_dsq_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_bogus_dsq_fail *skel; + + skel = ddsp_bogus_dsq_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + + ddsp_bogus_dsq_fail__destroy(skel); +} + +struct scx_test ddsp_bogus_dsq_fail = { + .name = "ddsp_bogus_dsq_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct dispatch to an invalid" + " DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c new file mode 100644 index 000000000000..dde7e7dafbfb --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. 
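+		 * The core rejects the dispatch and aborts the scheduler with
+		 * SCX_EXIT_ERROR, which the userspace half of this test
+		 * verifies.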
*/
+		scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
+				       p->scx.dsq_vtime, 0);
+		return cpu;
+	}
+
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops ddsp_vtimelocal_fail_ops = {
+	.select_cpu		= ddsp_vtimelocal_fail_select_cpu,
+	.exit			= ddsp_vtimelocal_fail_exit,
+	.name			= "ddsp_vtimelocal_fail",
+	.timeout_ms		= 1000U,
+};
diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c
new file mode 100644
index 000000000000..abafee587cd6
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet
+ * Copyright (c) 2024 Tejun Heo
+ */
+#include
+#include
+#include
+#include "ddsp_vtimelocal_fail.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct ddsp_vtimelocal_fail *skel;
+
+	skel = ddsp_vtimelocal_fail__open_and_load();
+	SCX_FAIL_IF(!skel, "Failed to open and load skel");
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct ddsp_vtimelocal_fail *skel = ctx;
+	struct bpf_link *link;
+
+	link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
+
+	sleep(1);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR));
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct ddsp_vtimelocal_fail *skel = ctx;
+
+	ddsp_vtimelocal_fail__destroy(skel);
+}
+
+struct scx_test ddsp_vtimelocal_fail = {
+	.name = "ddsp_vtimelocal_fail",
+	.description = "Verify we gracefully fail, and fall back to using a "
+		       "built-in DSQ, if we do a direct vtime dispatch to a "
+		       "built-in DSQ from ops.select_cpu()",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&ddsp_vtimelocal_fail)
diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c
new file mode 100644
index 000000000000..efb4672decb4
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet
+ */
+#include
+
+char _license[] SEC("license") = "GPL";
+const volatile s32 nr_cpus;
+
+UEI_DEFINE(uei);
+
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, 8192);
+	__type(value, s32);
+} queue SEC(".maps");
+
+s32 BPF_STRUCT_OPS(dsp_local_on_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dsp_local_on_enqueue, struct task_struct *p,
+		    u64 enq_flags)
+{
+	s32 pid = p->pid;
+
+	if (bpf_map_push_elem(&queue, &pid, 0))
+		scx_bpf_error("Failed to enqueue %s[%d]", p->comm, p->pid);
+}
+
+void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev)
+{
+	s32 pid, target;
+	struct task_struct *p;
+
+	if (bpf_map_pop_elem(&queue, &pid))
+		return;
+
+	p = bpf_task_from_pid(pid);
+	if (!p)
+		return;
+
+	target = bpf_get_prandom_u32() % nr_cpus;
+
+	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0);
+	bpf_task_release(p);
+}
+
+void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dsp_local_on_ops = {
+	.select_cpu		= dsp_local_on_select_cpu,
+	.enqueue		= dsp_local_on_enqueue,
+	.dispatch		= dsp_local_on_dispatch,
+	.exit			= dsp_local_on_exit,
+	.name			= "dsp_local_on",
+	.timeout_ms		= 1000U,
+};
diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c
new file mode 100644
index 000000000000..472851b56854
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dsp_local_on.c
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet
+ */
+#include
+#include
+#include
+#include "dsp_local_on.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dsp_local_on *skel;
+
+	skel = dsp_local_on__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+
+	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+	SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel");
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dsp_local_on *skel = ctx;
+	struct bpf_link *link;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dsp_local_on_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
+
+	/* Just sleeping is fine, plenty of scheduling events happening */
+	sleep(1);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR));
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dsp_local_on *skel = ctx;
+
+	dsp_local_on__destroy(skel);
+}
+
+struct scx_test dsp_local_on = {
+	.name = "dsp_local_on",
+	.description = "Verify we can directly dispatch tasks to local DSQs "
+		       "from ops.dispatch()",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&dsp_local_on)
diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c
new file mode 100644
index 000000000000..b0b99531d5d5
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A scheduler that validates that loading fails when the SCX_OPS_ENQ_LAST
+ * flag is specified without an ops.enqueue() implementation.
+ *
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC(".struct_ops.link")
+struct sched_ext_ops enq_last_no_enq_fails_ops = {
+	.name			= "enq_last_no_enq_fails",
+	/* SCX_OPS_ENQ_LAST requires ops.enqueue() to be defined */
+	.flags			= SCX_OPS_ENQ_LAST,
+	.timeout_ms		= 1000U,
+};
diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c
new file mode 100644
index 000000000000..2a3eda5e2c0b
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "enq_last_no_enq_fails.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct enq_last_no_enq_fails *skel;
+
+	skel = enq_last_no_enq_fails__open_and_load();
+	if (!skel) {
+		SCX_ERR("Failed to open and load skel");
+		return SCX_TEST_FAIL;
+	}
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct enq_last_no_enq_fails *skel = ctx;
+	struct bpf_link *link;
+
+	link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops);
+	if (link) {
+		SCX_ERR("Incorrectly succeeded in attaching scheduler");
+		return SCX_TEST_FAIL;
+	}
+
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct enq_last_no_enq_fails *skel = ctx;
+
+	enq_last_no_enq_fails__destroy(skel);
+}
+
+struct scx_test enq_last_no_enq_fails = {
+	.name = "enq_last_no_enq_fails",
+	.description = "Verify we fail to load a scheduler if we specify "
+		       "the SCX_OPS_ENQ_LAST flag without defining "
+		       "ops.enqueue()",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&enq_last_no_enq_fails)
diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
new file mode 100644
index 000000000000..b3dfc1033cd6
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Manually specify the signature until the kfunc is added to the scx repo. */
+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+			   bool *found) __ksym;
+
+s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p,
+		    u64 enq_flags)
+{
+	/*
+	 * Need to initialize the variable or the verifier will fail to load.
+	 * Improving these semantics is actively being worked on.
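
For contrast with enq_last_no_enq_fails above, pairing SCX_OPS_ENQ_LAST with
an ops.enqueue() implementation is expected to load. A minimal sketch, with
the enq_last_ok_* names invented for illustration and the SCX_ENQ_LAST
delivery behavior summarized as documented for the flag:

	void BPF_STRUCT_OPS(enq_last_ok_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/*
		 * With SCX_OPS_ENQ_LAST, the last runnable task on a CPU is
		 * also routed here rather than being kept running by default.
		 */
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	}

	SEC(".struct_ops.link")
	struct sched_ext_ops enq_last_ok_ops = {
		.enqueue	= enq_last_ok_enqueue,
		.flags		= SCX_OPS_ENQ_LAST,
		.name		= "enq_last_ok",
	};
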
+ */ + bool found = false; + + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_select_cpu_fails_ops = { + .select_cpu = enq_select_cpu_fails_select_cpu, + .enqueue = enq_select_cpu_fails_enqueue, + .name = "enq_select_cpu_fails", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c new file mode 100644 index 000000000000..dd1350e5f002 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "enq_select_cpu_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_select_cpu_fails *skel; + + skel = enq_select_cpu_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + sleep(1); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + + enq_select_cpu_fails__destroy(skel); +} + +struct scx_test enq_select_cpu_fails = { + .name = "enq_select_cpu_fails", + .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " + "from ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_select_cpu_fails) diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c new file mode 100644 index 000000000000..ae12ddaac921 --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
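
The enqueue-side call above is the invalid case; the kfunc's intended call
site is ops.select_cpu(), where @found reports whether an idle CPU was picked
so the task can be dispatched directly. A sketch of the valid shape, with
example_select_cpu being an illustrative name:

	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		bool found = false;
		s32 cpu;

		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
		if (found)
			/* An idle CPU was picked; dispatch straight to it. */
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

		return cpu;
	}
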
+ * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +#include "exit_test.h" + +const volatile int exit_point; +UEI_DEFINE(uei); + +#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point) + +s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + bool found; + + if (exit_point == EXIT_SELECT_CPU) + EXIT_CLEANLY(); + + return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found); +} + +void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (exit_point == EXIT_ENQUEUE) + EXIT_CLEANLY(); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) +{ + if (exit_point == EXIT_DISPATCH) + EXIT_CLEANLY(); + + scx_bpf_consume(SCX_DSQ_GLOBAL); +} + +void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) +{ + if (exit_point == EXIT_ENABLE) + EXIT_CLEANLY(); +} + +s32 BPF_STRUCT_OPS(exit_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (exit_point == EXIT_INIT_TASK) + EXIT_CLEANLY(); + + return 0; +} + +void BPF_STRUCT_OPS(exit_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init) +{ + if (exit_point == EXIT_INIT) + EXIT_CLEANLY(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops exit_ops = { + .select_cpu = exit_select_cpu, + .enqueue = exit_enqueue, + .dispatch = exit_dispatch, + .init_task = exit_init_task, + .enable = exit_enable, + .exit = exit_exit, + .init = exit_init, + .name = "exit", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c new file mode 100644 index 000000000000..31bcd06e21cd --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit.c @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "exit.bpf.skel.h" +#include "scx_test.h" + +#include "exit_test.h" + +static enum scx_test_status run(void *ctx) +{ + enum exit_test_case tc; + + for (tc = 0; tc < NUM_EXITS; tc++) { + struct exit *skel; + struct bpf_link *link; + char buf[16]; + + skel = exit__open(); + skel->rodata->exit_point = tc; + exit__load(skel); + link = bpf_map__attach_struct_ops(skel->maps.exit_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + exit__destroy(skel); + return SCX_TEST_FAIL; + } + + /* Assumes uei.kind is written last */ + while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); + SCX_EQ(skel->data->uei.exit_code, tc); + sprintf(buf, "%d", tc); + SCX_ASSERT(!strcmp(skel->data->uei.msg, buf)); + bpf_link__destroy(link); + exit__destroy(skel); + } + + return SCX_TEST_PASS; +} + +struct scx_test exit_test = { + .name = "exit", + .description = "Verify we can cleanly exit a scheduler in multiple places", + .run = run, +}; +REGISTER_SCX_TEST(&exit_test) diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h new file mode 100644 index 000000000000..94f0268b9cb8 --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit_test.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ + +#ifndef __EXIT_TEST_H__ +#define __EXIT_TEST_H__ + +enum exit_test_case { + EXIT_SELECT_CPU, + EXIT_ENQUEUE, + EXIT_DISPATCH, + EXIT_ENABLE, + EXIT_INIT_TASK, + EXIT_INIT, + NUM_EXITS, +}; + +#endif // # __EXIT_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/hotplug.bpf.c b/tools/testing/selftests/sched_ext/hotplug.bpf.c new file mode 100644 index 000000000000..8f2601db39f3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug.bpf.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +#include "hotplug_test.h" + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(hotplug_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +static void exit_from_hotplug(s32 cpu, bool onlining) +{ + /* + * Ignored, just used to verify that we can invoke blocking kfuncs + * from the hotplug path. + */ + scx_bpf_create_dsq(0, -1); + + s64 code = SCX_ECODE_ACT_RESTART | HOTPLUG_EXIT_RSN; + + if (onlining) + code |= HOTPLUG_ONLINING; + + scx_bpf_exit(code, "hotplug event detected (%d going %s)", cpu, + onlining ? "online" : "offline"); +} + +void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_online, s32 cpu) +{ + exit_from_hotplug(cpu, true); +} + +void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu) +{ + exit_from_hotplug(cpu, false); +} + +SEC(".struct_ops.link") +struct sched_ext_ops hotplug_cb_ops = { + .cpu_online = hotplug_cpu_online, + .cpu_offline = hotplug_cpu_offline, + .exit = hotplug_exit, + .name = "hotplug_cbs", + .timeout_ms = 1000U, +}; + +SEC(".struct_ops.link") +struct sched_ext_ops hotplug_nocb_ops = { + .exit = hotplug_exit, + .name = "hotplug_nocbs", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c new file mode 100644 index 000000000000..87bf220b1bce --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include + +#include "hotplug_test.h" +#include "hotplug.bpf.skel.h" +#include "scx_test.h" +#include "util.h" + +const char *online_path = "/sys/devices/system/cpu/cpu1/online"; + +static bool is_cpu_online(void) +{ + return file_read_long(online_path) > 0; +} + +static void toggle_online_status(bool online) +{ + long val = online ? 1 : 0; + int ret; + + ret = file_write_long(online_path, val); + if (ret != 0) + fprintf(stderr, "Failed to bring CPU %s (%s)", + online ? 
"online" : "offline", strerror(errno)); +} + +static enum scx_test_status setup(void **ctx) +{ + if (!is_cpu_online()) + return SCX_TEST_SKIP; + + return SCX_TEST_PASS; +} + +static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) +{ + struct hotplug *skel; + struct bpf_link *link; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + + skel = hotplug__open_and_load(); + SCX_ASSERT(skel); + + /* Testing the offline -> online path, so go offline before starting */ + if (onlining) + toggle_online_status(0); + + if (cbs_defined) { + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_BPF); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | HOTPLUG_EXIT_RSN; + if (onlining) + code |= HOTPLUG_ONLINING; + } else { + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + } + + if (cbs_defined) + link = bpf_map__attach_struct_ops(skel->maps.hotplug_cb_ops); + else + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + if (!link) { + SCX_ERR("Failed to attach scheduler"); + hotplug__destroy(skel); + return SCX_TEST_FAIL; + } + + toggle_online_status(onlining ? 1 : 0); + + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, kind); + SCX_EQ(UEI_REPORT(skel, uei), code); + + if (!onlining) + toggle_online_status(1); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status test_hotplug_attach(void) +{ + struct hotplug *skel; + struct bpf_link *link; + enum scx_test_status status = SCX_TEST_PASS; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + SCX_ASSERT(scx_hotplug_seq() > 0); + + skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); + SCX_ASSERT(skel); + + SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); + + /* + * Take the CPU offline to increment the global hotplug seq, which + * should cause attach to fail due to us setting the hotplug seq above + */ + toggle_online_status(0); + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + toggle_online_status(1); + + SCX_ASSERT(link); + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + SCX_EQ(skel->data->uei.kind, kind); + SCX_EQ(UEI_REPORT(skel, uei), code); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return status; +} + +static enum scx_test_status run(void *ctx) +{ + +#define HP_TEST(__onlining, __cbs_defined) ({ \ + if (test_hotplug(__onlining, __cbs_defined) != SCX_TEST_PASS) \ + return SCX_TEST_FAIL; \ +}) + + HP_TEST(true, true); + HP_TEST(false, true); + HP_TEST(true, false); + HP_TEST(false, false); + +#undef HP_TEST + + return test_hotplug_attach(); +} + +static void cleanup(void *ctx) +{ + toggle_online_status(1); +} + +struct scx_test hotplug_test = { + .name = "hotplug", + .description = "Verify hotplug behavior", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&hotplug_test) diff --git a/tools/testing/selftests/sched_ext/hotplug_test.h b/tools/testing/selftests/sched_ext/hotplug_test.h new file mode 100644 index 000000000000..73d236f90787 --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug_test.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ + +#ifndef __HOTPLUG_TEST_H__ +#define __HOTPLUG_TEST_H__ + +enum hotplug_test_flags { + HOTPLUG_EXIT_RSN = 1LLU << 0, + HOTPLUG_ONLINING = 1LLU << 1, +}; + +#endif // # __HOTPLUG_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/init_enable_count.bpf.c b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c new file mode 100644 index 000000000000..47ea89a626c3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that verifies that we do proper counting of init, enable, etc + * callbacks. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; +u64 init_fork_cnt, init_transition_cnt; + +s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + __sync_fetch_and_add(&init_task_cnt, 1); + + if (args->fork) + __sync_fetch_and_add(&init_fork_cnt, 1); + else + __sync_fetch_and_add(&init_transition_cnt, 1); + + return 0; +} + +void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) +{ + __sync_fetch_and_add(&exit_task_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) +{ + __sync_fetch_and_add(&enable_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) +{ + __sync_fetch_and_add(&disable_cnt, 1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops init_enable_count_ops = { + .init_task = cnt_init_task, + .exit_task = cnt_exit_task, + .enable = cnt_enable, + .disable = cnt_disable, + .name = "init_enable_count", +}; diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c new file mode 100644 index 000000000000..97d45f1e5597 --- /dev/null +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include "scx_test.h" +#include "init_enable_count.bpf.skel.h" + +#define SCHED_EXT 7 + +static struct init_enable_count * +open_load_prog(bool global) +{ + struct init_enable_count *skel; + + skel = init_enable_count__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + if (!global) + skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + + SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); + + return skel; +} + +static enum scx_test_status run_test(bool global) +{ + struct init_enable_count *skel; + struct bpf_link *link; + const u32 num_children = 5, num_pre_forks = 1024; + int ret, i, status; + struct sched_param param = {}; + pid_t pids[num_pre_forks]; + + skel = open_load_prog(global); + + /* + * Fork a bunch of children before we attach the scheduler so that we + * ensure (at least in practical terms) that there are more tasks that + * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that + * take the fork() path either below or in other processes. 
+ */ + for (i = 0; i < num_pre_forks; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + for (i = 0; i < num_pre_forks; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for pre-forked child\n"); + + SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, + status); + } + + bpf_link__destroy(link); + SCX_GE(skel->bss->init_task_cnt, num_pre_forks); + SCX_GE(skel->bss->exit_task_cnt, num_pre_forks); + + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + /* SCHED_EXT children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + + if (pids[i] == 0) { + ret = sched_setscheduler(0, SCHED_EXT, ¶m); + SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); + + /* + * Reset to SCHED_OTHER for half of them. Counts for + * everything should still be the same regardless, as + * ops.disable() is invoked even if a task is still on + * SCHED_EXT before it exits. + */ + if (i % 2 == 0) { + ret = sched_setscheduler(0, SCHED_OTHER, ¶m); + SCX_BUG_ON(ret, "Failed to reset sched to normal"); + } + exit(0); + } + } + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for SCX child\n"); + + SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, + status); + } + + /* SCHED_OTHER children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + if (pids[i] == 0) + exit(0); + } + + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for normal child\n"); + + SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, + status); + } + + bpf_link__destroy(link); + + SCX_GE(skel->bss->init_task_cnt, 2 * num_children); + SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); + + if (global) { + SCX_GE(skel->bss->enable_cnt, 2 * num_children); + SCX_GE(skel->bss->disable_cnt, 2 * num_children); + } else { + SCX_EQ(skel->bss->enable_cnt, num_children); + SCX_EQ(skel->bss->disable_cnt, num_children); + } + /* + * We forked a ton of tasks before we attached the scheduler above, so + * this should be fine. Technically it could be flaky if a ton of forks + * are happening at the same time in other processes, but that should + * be exceedingly unlikely. + */ + SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt); + SCX_GE(skel->bss->init_fork_cnt, 2 * num_children); + + init_enable_count__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + enum scx_test_status status; + + status = run_test(true); + if (status != SCX_TEST_PASS) + return status; + + return run_test(false); +} + +struct scx_test init_enable_count = { + .name = "init_enable_count", + .description = "Verify we do the correct amount of counting of init, " + "enable, etc callbacks.", + .run = run, +}; +REGISTER_SCX_TEST(&init_enable_count) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c new file mode 100644 index 000000000000..44612fdaf399 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler with every callback defined. 
+ *
+ * Every callback is defined as a trivial or no-op implementation, so loading
+ * this scheduler exercises every struct_ops entry point at once.
+ *
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu,
+		   u64 wake_flags)
+{
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags)
+{}
+
+void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_consume(SCX_DSQ_GLOBAL);
+}
+
+void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags)
+{}
+
+void BPF_STRUCT_OPS(maximal_running, struct task_struct *p)
+{}
+
+void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable)
+{}
+
+void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags)
+{}
+
+bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from,
+		    struct task_struct *to)
+{
+	return false;
+}
+
+bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a,
+		    struct task_struct *b)
+{
+	return false;
+}
+
+void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight)
+{}
+
+void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p,
+		    const struct cpumask *cpumask)
+{}
+
+void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle)
+{}
+
+void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu,
+		    struct scx_cpu_acquire_args *args)
+{}
+
+void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu,
+		    struct scx_cpu_release_args *args)
+{}
+
+void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu)
+{}
+
+void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu)
+{}
+
+s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	return 0;
+}
+
+void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p)
+{}
+
+void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p,
+		    struct scx_exit_task_args *args)
+{}
+
+void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p)
+{}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
+{
+	return 0;
+}
+
+void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info)
+{}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops maximal_ops = {
+	.select_cpu		= maximal_select_cpu,
+	.enqueue		= maximal_enqueue,
+	.dequeue		= maximal_dequeue,
+	.dispatch		= maximal_dispatch,
+	.runnable		= maximal_runnable,
+	.running		= maximal_running,
+	.stopping		= maximal_stopping,
+	.quiescent		= maximal_quiescent,
+	.yield			= maximal_yield,
+	.core_sched_before	= maximal_core_sched_before,
+	.set_weight		= maximal_set_weight,
+	.set_cpumask		= maximal_set_cpumask,
+	.update_idle		= maximal_update_idle,
+	.cpu_acquire		= maximal_cpu_acquire,
+	.cpu_release		= maximal_cpu_release,
+	.cpu_online		= maximal_cpu_online,
+	.cpu_offline		= maximal_cpu_offline,
+	.init_task		= maximal_init_task,
+	.enable			= maximal_enable,
+	.exit_task		= maximal_exit_task,
+	.disable		= maximal_disable,
+	.init			= maximal_init,
+	.exit			= maximal_exit,
+	.name			= "maximal",
+};
diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c
new file mode 100644
index 000000000000..f38fc973c380
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maximal.c
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "maximal.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct maximal *skel;
+
+	skel = maximal__open_and_load();
+	SCX_FAIL_IF(!skel, "Failed to open and load skel");
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct maximal *skel = ctx;
+	struct bpf_link *link;
+
+	link = bpf_map__attach_struct_ops(skel->maps.maximal_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct maximal *skel = ctx;
+
+	maximal__destroy(skel);
+}
+
+struct scx_test maximal = {
+	.name = "maximal",
+	.description = "Verify we can load a scheduler with every callback defined",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&maximal)
diff --git a/tools/testing/selftests/sched_ext/maybe_null.bpf.c b/tools/testing/selftests/sched_ext/maybe_null.bpf.c
new file mode 100644
index 000000000000..27d0f386acfb
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maybe_null.bpf.c
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+u64 vtime_test;
+
+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p)
+{}
+
+void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p)
+{
+	/* @p may be NULL; the verifier requires the check below. */
+	if (p != NULL)
+		vtime_test = p->scx.dsq_vtime;
+}
+
+bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from,
+		    struct task_struct *to)
+{
+	if (to)
+		bpf_printk("Yielding to %s[%d]", to->comm, to->pid);
+
+	return false;
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops maybe_null_success = {
+	.dispatch		= maybe_null_success_dispatch,
+	.yield			= maybe_null_success_yield,
+	.enable			= maybe_null_running,
+	.name			= "maybe_null_success",
+};
diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c
new file mode 100644
index 000000000000..31cfafb0cf65
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maybe_null.c
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
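
The fail_dsp and fail_yld variants exercised below exist because the verifier
treats the task argument of ops.dispatch() and the @to argument of ops.yield()
as PTR_MAYBE_NULL: a dereference that is not preceded by a NULL check is
rejected at load time. The accepted shape is simply the guarded one, as in
this fragment:

	/* Accepted: the dereference is dominated by a NULL check. */
	if (to)
		bpf_printk("Yielding to %s[%d]", to->comm, to->pid);
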
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <unistd.h>
+#include "maybe_null.bpf.skel.h"
+#include "maybe_null_fail_dsp.bpf.skel.h"
+#include "maybe_null_fail_yld.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status run(void *ctx)
+{
+	struct maybe_null *skel;
+	struct maybe_null_fail_dsp *fail_dsp;
+	struct maybe_null_fail_yld *fail_yld;
+
+	skel = maybe_null__open_and_load();
+	if (!skel) {
+		SCX_ERR("Failed to open and load maybe_null skel");
+		return SCX_TEST_FAIL;
+	}
+	maybe_null__destroy(skel);
+
+	fail_dsp = maybe_null_fail_dsp__open_and_load();
+	if (fail_dsp) {
+		maybe_null_fail_dsp__destroy(fail_dsp);
+		SCX_ERR("Should have failed to open and load maybe_null_fail_dsp skel");
+		return SCX_TEST_FAIL;
+	}
+
+	fail_yld = maybe_null_fail_yld__open_and_load();
+	if (fail_yld) {
+		maybe_null_fail_yld__destroy(fail_yld);
+		SCX_ERR("Should have failed to open and load maybe_null_fail_yld skel");
+		return SCX_TEST_FAIL;
+	}
+
+	return SCX_TEST_PASS;
+}
+
+struct scx_test maybe_null = {
+	.name = "maybe_null",
+	.description = "Verify that PTR_MAYBE_NULL works for .dispatch and .yield",
+	.run = run,
+};
+REGISTER_SCX_TEST(&maybe_null)
diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c
new file mode 100644
index 000000000000..c0641050271d
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+u64 vtime_test;
+
+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p)
+{}
+
+void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p)
+{
+	/* Deliberately dereferences a PTR_MAYBE_NULL arg without a check. */
+	vtime_test = p->scx.dsq_vtime;
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops maybe_null_fail = {
+	.dispatch		= maybe_null_fail_dispatch,
+	.enable			= maybe_null_running,
+	.name			= "maybe_null_fail_dispatch",
+};
diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c
new file mode 100644
index 000000000000..3c1740028e3b
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+u64 vtime_test;
+
+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p)
+{}
+
+bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from,
+		    struct task_struct *to)
+{
+	/* Deliberately dereferences @to, which may be NULL, without a check. */
+	bpf_printk("Yielding to %s[%d]", to->comm, to->pid);
+
+	return false;
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops maybe_null_fail = {
+	.yield			= maybe_null_fail_yield,
+	.enable			= maybe_null_running,
+	.name			= "maybe_null_fail_yield",
+};
diff --git a/tools/testing/selftests/sched_ext/minimal.bpf.c b/tools/testing/selftests/sched_ext/minimal.bpf.c
new file mode 100644
index 000000000000..6a7eccef0104
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/minimal.bpf.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A completely minimal scheduler.
+ *
+ * This scheduler defines the absolute minimal set of struct sched_ext_ops
+ * fields: its name. It should _not_ fail to be loaded, and can be used to
+ * exercise the default scheduling paths in ext.c.
+ *
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +SEC(".struct_ops.link") +struct sched_ext_ops minimal_ops = { + .name = "minimal", +}; diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c new file mode 100644 index 000000000000..6c5db8ebbf8a --- /dev/null +++ b/tools/testing/selftests/sched_ext/minimal.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct minimal *skel; + + skel = minimal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct minimal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct minimal *skel = ctx; + + minimal__destroy(skel); +} + +struct scx_test minimal = { + .name = "minimal", + .description = "Verify we can load a fully minimal scheduler", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&minimal) diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c new file mode 100644 index 000000000000..fd2c8f12af16 --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that we can invoke sched_ext kfuncs in + * BPF_PROG_TYPE_SYSCALL programs. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +UEI_DEFINE(uei); + +char _license[] SEC("license") = "GPL"; + +SEC("syscall") +int BPF_PROG(prog_run_syscall) +{ + scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); + return 0; +} + +void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops prog_run_ops = { + .exit = prog_run_exit, + .name = "prog_run", +}; diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c new file mode 100644 index 000000000000..3cd57ef8daaa --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "prog_run.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct prog_run *skel; + + skel = prog_run__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct prog_run *skel = ctx; + struct bpf_link *link; + int prog_fd, err = 0; + + prog_fd = bpf_program__fd(skel->progs.prog_run_syscall); + if (prog_fd < 0) { + SCX_ERR("Failed to get BPF_PROG_RUN prog"); + return SCX_TEST_FAIL; + } + + LIBBPF_OPTS(bpf_test_run_opts, topts); + + link = bpf_map__attach_struct_ops(skel->maps.prog_run_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + close(prog_fd); + return SCX_TEST_FAIL; + } + + err = bpf_prog_test_run_opts(prog_fd, &topts); + SCX_EQ(err, 0); + + /* Assumes uei.kind is written last */ + while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); + SCX_EQ(skel->data->uei.exit_code, 0xdeadbeef); + close(prog_fd); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct prog_run *skel = ctx; + + prog_run__destroy(skel); +} + +struct scx_test prog_run = { + .name = "prog_run", + .description = "Verify we can call into a scheduler with BPF_PROG_RUN, and invoke kfuncs", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&prog_run) diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c new file mode 100644 index 000000000000..5cfba2d6e056 --- /dev/null +++ b/tools/testing/selftests/sched_ext/reload_loop.c @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static struct maximal *skel; +static pthread_t threads[2]; + +bool force_exit = false; + +static enum scx_test_status setup(void **ctx) +{ + skel = maximal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +static void *do_reload_loop(void *arg) +{ + u32 i; + + for (i = 0; i < 1024 && !force_exit; i++) { + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + if (link) + bpf_link__destroy(link); + } + + return NULL; +} + +static enum scx_test_status run(void *ctx) +{ + int err; + void *ret; + + err = pthread_create(&threads[0], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 0"); + + err = pthread_create(&threads[1], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 1"); + + SCX_FAIL_IF(pthread_join(threads[0], &ret), "thread 0 failed"); + SCX_FAIL_IF(pthread_join(threads[1], &ret), "thread 1 failed"); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + force_exit = true; + maximal__destroy(skel); +} + +struct scx_test reload_loop = { + .name = "reload_loop", + .description = "Stress test loading and unloading schedulers repeatedly in a tight loop", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&reload_loop) diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c new file mode 100644 index 000000000000..eab48c7ff309 --- /dev/null +++ b/tools/testing/selftests/sched_ext/runner.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include +#include "scx_test.h" + +const char help_fmt[] = +"The runner for sched_ext tests.\n" +"\n" +"The runner is statically linked against all testcases, and runs them all serially.\n" +"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" +"scheduler may be loaded at any given time." +"\n" +"Usage: %s [-t TEST] [-h]\n" +"\n" +" -t TEST Only run tests whose name includes this string\n" +" -s Include print output for skipped tests\n" +" -q Don't print the test descriptions during run\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; +static bool quiet, print_skipped; + +#define MAX_SCX_TESTS 2048 + +static struct scx_test __scx_tests[MAX_SCX_TESTS]; +static unsigned __scx_num_tests = 0; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void print_test_preamble(const struct scx_test *test, bool quiet) +{ + printf("===== START =====\n"); + printf("TEST: %s\n", test->name); + if (!quiet) + printf("DESCRIPTION: %s\n", test->description); + printf("OUTPUT:\n"); +} + +static const char *status_to_result(enum scx_test_status status) +{ + switch (status) { + case SCX_TEST_PASS: + case SCX_TEST_SKIP: + return "ok"; + case SCX_TEST_FAIL: + return "not ok"; + default: + return ""; + } +} + +static void print_test_result(const struct scx_test *test, + enum scx_test_status status, + unsigned int testnum) +{ + const char *result = status_to_result(status); + const char *directive = status == SCX_TEST_SKIP ? 
"SKIP " : ""; + + printf("%s %u %s # %s\n", result, testnum, test->name, directive); + printf("===== END =====\n"); +} + +static bool should_skip_test(const struct scx_test *test, const char * filter) +{ + return !strstr(test->name, filter); +} + +static enum scx_test_status run_test(const struct scx_test *test) +{ + enum scx_test_status status; + void *context = NULL; + + if (test->setup) { + status = test->setup(&context); + if (status != SCX_TEST_PASS) + return status; + } + + status = test->run(context); + + if (test->cleanup) + test->cleanup(context); + + return status; +} + +static bool test_valid(const struct scx_test *test) +{ + if (!test) { + fprintf(stderr, "NULL test detected\n"); + return false; + } + + if (!test->name) { + fprintf(stderr, + "Test with no name found. Must specify test name.\n"); + return false; + } + + if (!test->description) { + fprintf(stderr, "Test %s requires description.\n", test->name); + return false; + } + + if (!test->run) { + fprintf(stderr, "Test %s has no run() callback\n", test->name); + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + const char *filter = NULL; + unsigned testnum = 0, i; + unsigned passed = 0, skipped = 0, failed = 0; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + while ((opt = getopt(argc, argv, "qst:h")) != -1) { + switch (opt) { + case 'q': + quiet = true; + break; + case 's': + print_skipped = true; + break; + case 't': + filter = optarg; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < __scx_num_tests; i++) { + enum scx_test_status status; + struct scx_test *test = &__scx_tests[i]; + + if (filter && should_skip_test(test, filter)) { + /* + * Printing the skipped tests and their preambles can + * add a lot of noise to the runner output. Printing + * this is only really useful for CI, so let's skip it + * by default. + */ + if (print_skipped) { + print_test_preamble(test, quiet); + print_test_result(test, SCX_TEST_SKIP, ++testnum); + } + continue; + } + + print_test_preamble(test, quiet); + status = run_test(test); + print_test_result(test, status, ++testnum); + switch (status) { + case SCX_TEST_PASS: + passed++; + break; + case SCX_TEST_SKIP: + skipped++; + break; + case SCX_TEST_FAIL: + failed++; + break; + } + } + printf("\n\n=============================\n\n"); + printf("RESULTS:\n\n"); + printf("PASSED: %u\n", passed); + printf("SKIPPED: %u\n", skipped); + printf("FAILED: %u\n", failed); + + return 0; +} + +void scx_test_register(struct scx_test *test) +{ + SCX_BUG_ON(!test_valid(test), "Invalid test found"); + SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); + + __scx_tests[__scx_num_tests++] = *test; +} diff --git a/tools/testing/selftests/sched_ext/scx_test.h b/tools/testing/selftests/sched_ext/scx_test.h new file mode 100644 index 000000000000..90b8d6915bb7 --- /dev/null +++ b/tools/testing/selftests/sched_ext/scx_test.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ + +#ifndef __SCX_TEST_H__ +#define __SCX_TEST_H__ + +#include +#include +#include + +enum scx_test_status { + SCX_TEST_PASS = 0, + SCX_TEST_SKIP, + SCX_TEST_FAIL, +}; + +#define EXIT_KIND(__ent) __COMPAT_ENUM_OR_ZERO("scx_exit_kind", #__ent) + +struct scx_test { + /** + * name - The name of the testcase. 
+ */ + const char *name; + + /** + * description - A description of your testcase: what it tests and is + * meant to validate. + */ + const char *description; + + /* + * setup - Setup the test. + * @ctx: A pointer to a context object that will be passed to run and + * cleanup. + * + * An optional callback that allows a testcase to perform setup for its + * run. A test may return SCX_TEST_SKIP to skip the run. + */ + enum scx_test_status (*setup)(void **ctx); + + /* + * run - Run the test. + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * The main test. Callers should return one of: + * + * - SCX_TEST_PASS: Test passed + * - SCX_TEST_SKIP: Test should be skipped + * - SCX_TEST_FAIL: Test failed + * + * This callback must be defined. + */ + enum scx_test_status (*run)(void *ctx); + + /* + * cleanup - Perform cleanup following the test + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * An optional callback that allows a test to perform cleanup after + * being run. This callback is run even if the run() callback returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. + */ + void (*cleanup)(void *ctx); +}; + +void scx_test_register(struct scx_test *test); + +#define REGISTER_SCX_TEST(__test) \ + __attribute__((constructor)) \ + static void ___scxregister##__LINE__(void) \ + { \ + scx_test_register(__test); \ + } + +#define SCX_ERR(__fmt, ...) \ + do { \ + fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ + fprintf(stderr, __fmt"\n", ##__VA_ARGS__); \ + } while (0) + +#define SCX_FAIL(__fmt, ...) \ + do { \ + SCX_ERR(__fmt, ##__VA_ARGS__); \ + return SCX_TEST_FAIL; \ + } while (0) + +#define SCX_FAIL_IF(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_FAIL(__fmt, ##__VA_ARGS__); \ + } while (0) + +#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ + #_x, (u64)(_x)) + +#define SCX_ECODE_VAL(__ecode) ({ \ + u64 __val = 0; \ + bool __found = false; \ + \ + __found = __COMPAT_read_enum("scx_exit_code", #__ecode, &__val); \ + SCX_ASSERT(__found); \ + (s64)__val; \ +}) + +#define SCX_KIND_VAL(__kind) ({ \ + u64 __val = 0; \ + bool __found = false; \ + \ + __found = __COMPAT_read_enum("scx_exit_kind", #__kind, &__val); \ + SCX_ASSERT(__found); \ + __val; \ +}) + +#endif // # __SCX_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c new file mode 100644 index 000000000000..2ed2991afafe --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
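
Tying the interface above together, a new testcase only has to fill in struct
scx_test and register itself. A minimal skeleton with illustrative names:

	#include "scx_test.h"

	static enum scx_test_status run(void *ctx)
	{
		SCX_ASSERT(1 + 1 == 2);	/* use the assertion macros for checks */
		return SCX_TEST_PASS;
	}

	struct scx_test example = {
		.name = "example",
		.description = "Skeleton showing the scx_test interface",
		.run = run,
	};
	REGISTER_SCX_TEST(&example)
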
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+bool saw_local = false;
+
+static bool task_is_test(const struct task_struct *p)
+{
+	return !bpf_strncmp(p->comm, 9, "select_cpu");
+}
+
+void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p,
+		    u64 enq_flags)
+{
+	const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask();
+
+	if (task_is_test(p) &&
+	    bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) {
+		saw_local = true;
+	}
+	scx_bpf_put_idle_cpumask(idle_mask);
+
+	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops select_cpu_dfl_ops = {
+	.enqueue		= select_cpu_dfl_enqueue,
+	.name			= "select_cpu_dfl",
+};
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
new file mode 100644
index 000000000000..a53a40c2d2f0
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "select_cpu_dfl.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_CHILDREN 1028
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct select_cpu_dfl *skel;
+
+	skel = select_cpu_dfl__open_and_load();
+	SCX_FAIL_IF(!skel, "Failed to open and load skel");
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct select_cpu_dfl *skel = ctx;
+	struct bpf_link *link;
+	pid_t pids[NUM_CHILDREN];
+	int i, status;
+
+	link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	for (i = 0; i < NUM_CHILDREN; i++) {
+		pids[i] = fork();
+		if (pids[i] == 0) {
+			sleep(1);
+			exit(0);
+		}
+	}
+
+	for (i = 0; i < NUM_CHILDREN; i++) {
+		SCX_EQ(waitpid(pids[i], &status, 0), pids[i]);
+		SCX_EQ(status, 0);
+	}
+
+	SCX_ASSERT(!skel->bss->saw_local);
+
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct select_cpu_dfl *skel = ctx;
+
+	select_cpu_dfl__destroy(skel);
+}
+
+struct scx_test select_cpu_dfl = {
+	.name = "select_cpu_dfl",
+	.description = "Verify the default ops.select_cpu() dispatches tasks "
+		       "when idle cores are found, and skips ops.enqueue()",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&select_cpu_dfl)
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
new file mode 100644
index 000000000000..4bb5abb2d369
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A scheduler that validates the behavior of the default CPU selection logic
+ * when it is invoked manually via scx_bpf_select_cpu_dfl() from
+ * ops.select_cpu(), with the dispatch performed in ops.enqueue() instead.
+ *
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* CPU changed by ops.select_cpu() */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, + &tctx->force_local); + + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + u64 enq_flags) +{ + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (tctx->force_local) { + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + saw_local = true; + } + + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, + struct task_struct *p, struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { + .select_cpu = select_cpu_dfl_nodispatch_select_cpu, + .enqueue = select_cpu_dfl_nodispatch_enqueue, + .init_task = select_cpu_dfl_nodispatch_init_task, + .name = "select_cpu_dfl_nodispatch", +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c new file mode 100644 index 000000000000..1d85bf4bf3a3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dfl_nodispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl_nodispatch *skel; + + skel = select_cpu_dfl_nodispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + + select_cpu_dfl_nodispatch__destroy(skel); +} + +struct scx_test select_cpu_dfl_nodispatch = { + .name = "select_cpu_dfl_nodispatch", + .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c new file mode 100644 index 000000000000..f0b96a4a04b2 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + u64 dsq_id = SCX_DSQ_LOCAL; + s32 cpu = prev_cpu; + + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + goto dispatch; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto dispatch; + + dsq_id = SCX_DSQ_GLOBAL; + cpu = prev_cpu; + +dispatch: + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_ops = { + .select_cpu = select_cpu_dispatch_select_cpu, + .name = "select_cpu_dispatch", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c new file mode 100644 index 000000000000..0309ca8785b3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch *skel; + + skel = select_cpu_dispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + + select_cpu_dispatch__destroy(skel); +} + +struct scx_test select_cpu_dispatch = { + .name = "select_cpu_dispatch", + .description = "Test direct dispatching to built-in DSQs from " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c new file mode 100644 index 000000000000..7b42ddce0f56 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching to a random DSQ should fail. */ + scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { + .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, + .exit = select_cpu_dispatch_bad_dsq_exit, + .name = "select_cpu_dispatch_bad_dsq", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c new file mode 100644 index 000000000000..47eb6ed7627d --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel; + + skel = select_cpu_dispatch_bad_dsq__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + + select_cpu_dispatch_bad_dsq__destroy(skel); +} + +struct scx_test select_cpu_dispatch_bad_dsq = { + .name = "select_cpu_dispatch_bad_dsq", + .description = "Verify graceful failure if we direct-dispatch to a " + "bogus DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c new file mode 100644 index 000000000000..653e3dc0b4dc --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching twice in a row is disallowed. */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { + .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, + .exit = select_cpu_dispatch_dbl_dsp_exit, + .name = "select_cpu_dispatch_dbl_dsp", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c new file mode 100644 index 000000000000..48ff028a3c46 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel; + + skel = select_cpu_dispatch_dbl_dsp__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + + select_cpu_dispatch_dbl_dsp__destroy(skel); +} + +struct scx_test select_cpu_dispatch_dbl_dsp = { + .name = "select_cpu_dispatch_dbl_dsp", + .description = "Verify graceful failure if we dispatch twice to a " + "DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c new file mode 100644 index 000000000000..7f3ebf4fc2ea --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from + * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and + * making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +volatile bool consumed; + +static u64 vtime_now; + +#define VTIME_DSQ 0 + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static inline u64 task_vtime(const struct task_struct *p) +{ + u64 vtime = p->scx.dsq_vtime; + + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + return vtime_now - SCX_SLICE_DFL; + else + return vtime; +} + +s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto ddsp; + + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); +ddsp: + scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) +{ + if (scx_bpf_consume(VTIME_DSQ)) + consumed = true; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) +{ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, + bool runnable) +{ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) +{ + return scx_bpf_create_dsq(VTIME_DSQ, -1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_vtime_ops = { + .select_cpu = select_cpu_vtime_select_cpu, + .dispatch = select_cpu_vtime_dispatch, + .running = select_cpu_vtime_running, + .stopping = select_cpu_vtime_stopping, + .enable = select_cpu_vtime_enable, + .init = select_cpu_vtime_init, + .name = "select_cpu_vtime", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c new file mode 100644 index 000000000000..b4629c2364f5 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_vtime.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_vtime *skel; + + skel = select_cpu_vtime__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + struct bpf_link *link; + + SCX_ASSERT(!skel->bss->consumed); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_ASSERT(skel->bss->consumed); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + + select_cpu_vtime__destroy(skel); +} + +struct scx_test select_cpu_vtime = { + .name = "select_cpu_vtime", + .description = "Test doing direct vtime-dispatching from " + "ops.select_cpu(), to a non-built-in DSQ", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_vtime) diff --git a/tools/testing/selftests/sched_ext/test_example.c b/tools/testing/selftests/sched_ext/test_example.c new file mode 100644 index 000000000000..ce36cdf03cdc --- /dev/null +++ b/tools/testing/selftests/sched_ext/test_example.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include "scx_test.h" + +static bool setup_called = false; +static bool run_called = false; +static bool cleanup_called = false; + +static int context = 10; + +static enum scx_test_status setup(void **ctx) +{ + setup_called = true; + *ctx = &context; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + int *arg = ctx; + + SCX_ASSERT(setup_called); + SCX_ASSERT(!run_called && !cleanup_called); + SCX_EQ(*arg, context); + + run_called = true; + return SCX_TEST_PASS; +} + +static void cleanup (void *ctx) +{ + SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); +} + +struct scx_test example = { + .name = "example", + .description = "Validate the basic function of the test suite itself", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&example) diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c new file mode 100644 index 000000000000..e47769c91918 --- /dev/null +++ b/tools/testing/selftests/sched_ext/util.c @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include + +/* Returns read len on success, or -errno on failure. */ +static ssize_t read_text(const char *path, char *buf, size_t max_len) +{ + ssize_t len; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -errno; + + len = read(fd, buf, max_len - 1); + + if (len >= 0) + buf[len] = 0; + + close(fd); + return len < 0 ? -errno : len; +} + +/* Returns written len on success, or -errno on failure. */ +static ssize_t write_text(const char *path, char *buf, ssize_t len) +{ + int fd; + ssize_t written; + + fd = open(path, O_WRONLY | O_APPEND); + if (fd < 0) + return -errno; + + written = write(fd, buf, len); + close(fd); + return written < 0 ? 
-errno : written; +} + +long file_read_long(const char *path) +{ + char buf[128]; + + if (read_text(path, buf, sizeof(buf)) <= 0) + return -1; + + return atol(buf); +} + +int file_write_long(const char *path, long val) +{ + char buf[64]; + int ret; + + ret = sprintf(buf, "%ld", val); + if (ret < 0) + return ret; + + if (write_text(path, buf, ret) <= 0) + return -1; + + return 0; +} diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h new file mode 100644 index 000000000000..bc13dfec1267 --- /dev/null +++ b/tools/testing/selftests/sched_ext/util.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#ifndef __SCX_TEST_UTIL_H__ +#define __SCX_TEST_UTIL_H__ + +long file_read_long(const char *path); +int file_write_long(const char *path, long val); + +#endif // __SCX_TEST_UTIL_H__ -- cgit v1.2.3 From d86adb4fc0655a0867da811d000df75d2a325ef6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 21 Jun 2024 12:37:22 -1000 Subject: sched_ext: Add cpuperf support sched_ext currently does not integrate with schedutil. When schedutil is the governor, frequencies are left unregulated and usually get stuck close to the highest performance level from running RT tasks. Add CPU performance monitoring and scaling support by integrating into schedutil. The following kfuncs are added: - scx_bpf_cpuperf_cap(): Query the relative performance capacity of different CPUs in the system. - scx_bpf_cpuperf_cur(): Query the current performance level of a CPU relative to its max performance. - scx_bpf_cpuperf_set(): Set the current target performance level of a CPU. This gives direct control over CPU performance setting to the BPF scheduler. The only changes on the schedutil side are accounting for the utilization factor from sched_ext and disabling the frequency-holding heuristics, as they may not apply well to sched_ext schedulers, which may have a much weaker connection between tasks and their current / last CPU. With cpuperf support added, there is no reason to block uclamp. Enable while at it. A toy implementation of cpuperf is added to scx_qmap as a demonstration of the feature. v2: Ignore cpu_util_cfs_boost() when scx_switched_all() in sugov_get_util() to avoid factoring in a stale util metric. (Christian) Signed-off-by: Tejun Heo Reviewed-by: David Vernet Cc: Rafael J.
Wysocki Cc: Viresh Kumar Cc: Christian Loehle --- kernel/sched/cpufreq_schedutil.c | 12 ++- kernel/sched/ext.c | 83 +++++++++++++++++- kernel/sched/ext.h | 9 ++ kernel/sched/sched.h | 1 + tools/sched_ext/include/scx/common.bpf.h | 3 + tools/sched_ext/scx_qmap.bpf.c | 142 ++++++++++++++++++++++++++++++- tools/sched_ext/scx_qmap.c | 8 ++ 7 files changed, 252 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 972b7dd65af2..e683e5d08daa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { - unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); + unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu); + if (!scx_switched_all()) + util += cpu_util_cfs_boost(sg_cpu->cpu); util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); util = max(util, boost); sg_cpu->bw_min = min; @@ -330,6 +332,14 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) unsigned long idle_calls; bool ret; + /* + * The heuristics in this function are for the fair class. For SCX, the + * performance target comes directly from the BPF scheduler. Let's just + * follow it. + */ + if (scx_switched_all()) + return false; + /* if capped by uclamp_max, always update to be in compliance */ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) return false; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 390623a4a376..28f7a4266fde 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -16,6 +16,8 @@ enum scx_consts { SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, }; enum scx_exit_kind { @@ -3520,7 +3522,7 @@ DEFINE_SCHED_CLASS(ext) = { .update_curr = update_curr_scx, #ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled = 0, + .uclamp_enabled = 1, #endif }; @@ -4393,7 +4395,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, ret; + int i, cpu, ret; mutex_lock(&scx_ops_enable_mutex); @@ -4442,6 +4444,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) atomic_long_set(&scx_nr_rejected, 0); + for_each_possible_cpu(cpu) + cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + /* * Keep CPUs stable during enable so that the BPF scheduler can track * online CPUs by watching ->on/offline_cpu() after ->init(). @@ -5835,6 +5840,77 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, ops_dump_flush(); } +/** + * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU + * @cpu: CPU of interest + * + * Return the maximum relative capacity of @cpu in relation to the most + * performant CPU in the system. The return value is in the range [1, + * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) +{ + if (ops_cpu_valid(cpu, NULL)) + return arch_scale_cpu_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU + * @cpu: CPU of interest + * + * Return the current relative performance of @cpu in relation to its maximum. + * The return value is in the range [1, %SCX_CPUPERF_ONE].
+ * + * The current performance level of a CPU in relation to the maximum performance + * available in the system can be calculated as follows: + * + * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE + * + * The result is in the range [1, %SCX_CPUPERF_ONE]. + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) +{ + if (ops_cpu_valid(cpu, NULL)) + return arch_scale_freq_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_set - Set the relative performance target of a CPU + * @cpu: CPU of interest + * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * + * Set the target performance level of @cpu to @perf. @perf is in linear + * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the + * schedutil cpufreq governor chooses the target frequency. + * + * The actual performance level chosen, CPU grouping, and the overhead and + * latency of the operations are dependent on the hardware and cpufreq driver in + * use. Consult hardware and cpufreq documentation for more information. The + * current performance level can be monitored using scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc void scx_bpf_cpuperf_set(u32 cpu, u32 perf) +{ + if (unlikely(perf > SCX_CPUPERF_ONE)) { + scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + return; + } + + if (ops_cpu_valid(cpu, NULL)) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.cpuperf_target = perf; + + rcu_read_lock_sched_notrace(); + cpufreq_update_util(rq, 0); + rcu_read_unlock_sched_notrace(); + } +} + /** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * @@ -6045,6 +6121,9 @@ BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index bf6f2cfa49d5..0a7b9a34b18f 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -46,6 +46,14 @@ int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(struct task_struct *p); void init_sched_ext_class(void); +static inline u32 scx_cpuperf_target(s32 cpu) +{ + if (scx_enabled()) + return cpu_rq(cpu)->scx.cpuperf_target; + else + return 0; +} + static inline const struct sched_class *next_active_class(const struct sched_class *class) { class++; @@ -85,6 +93,7 @@ static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} +static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline void scx_rq_activate(struct rq *rq) {} static inline void scx_rq_deactivate(struct rq *rq) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3989bf8f2a1b..963a2fa180ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -743,6 +743,7 @@ struct scx_rq { u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; u32 flags; + u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ bool cpu_released; cpumask_var_t
cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 3fa87084cf17..dbbda0e35c5d 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -42,6 +42,9 @@ void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; +u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; +u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; +void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index c75c70d6a8eb..b1d0b09c966e 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -68,6 +68,18 @@ struct { }, }; +/* + * If enabled, the CPU performance target is set according to the queue index, + * as per the following table. + */ +static const u32 qidx_to_cpuperf_target[] = { + [0] = SCX_CPUPERF_ONE * 0 / 4, + [1] = SCX_CPUPERF_ONE * 1 / 4, + [2] = SCX_CPUPERF_ONE * 2 / 4, + [3] = SCX_CPUPERF_ONE * 3 / 4, + [4] = SCX_CPUPERF_ONE * 4 / 4, +}; + /* * Per-queue sequence numbers to implement core-sched ordering. * @@ -95,6 +107,8 @@ struct cpu_ctx { u64 dsp_idx; /* dispatch index */ u64 dsp_cnt; /* remaining count */ + u32 avg_weight; + u32 cpuperf_target; }; struct { @@ -107,6 +121,8 @@ struct { /* Statistics */ u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; u64 nr_core_sched_execed; +u32 cpuperf_min, cpuperf_avg, cpuperf_max; +u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -313,6 +329,29 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) +{ + struct cpu_ctx *cpuc; + u32 zero = 0; + int idx; + + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { + scx_bpf_error("failed to look up cpu_ctx"); + return; + } + + /* + * Use the running avg of weights to select the target cpuperf level. + * This is a demonstration of the cpuperf feature rather than a + * practical strategy to regulate CPU frequency. + */ + cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; + idx = weight_to_idx(cpuc->avg_weight); + cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; + + scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); +} + /* * The distance from the head of the queue scaled by the weight of the queue. * The lower the number, the older the task and the higher the priority.
@@ -422,8 +461,9 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) return; - scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu", - cpuc->dsp_idx, cpuc->dsp_cnt); + scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", + cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, + cpuc->cpuperf_target); } void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) @@ -492,11 +532,106 @@ void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) print_cpus(); } +struct monitor_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct monitor_timer); +} monitor_timer SEC(".maps"); + +/* + * Print out the min, avg and max performance levels of CPUs every second to + * demonstrate the cpuperf interface. + */ +static void monitor_cpuperf(void) +{ + u32 zero = 0, nr_cpu_ids; + u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; + u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; + const struct cpumask *online; + int i, nr_online_cpus = 0; + + nr_cpu_ids = scx_bpf_nr_cpu_ids(); + online = scx_bpf_get_online_cpumask(); + + bpf_for(i, 0, nr_cpu_ids) { + struct cpu_ctx *cpuc; + u32 cap, cur; + + if (!bpf_cpumask_test_cpu(i, online)) + continue; + nr_online_cpus++; + + /* collect the capacity and current cpuperf */ + cap = scx_bpf_cpuperf_cap(i); + cur = scx_bpf_cpuperf_cur(i); + + cur_min = cur < cur_min ? cur : cur_min; + cur_max = cur > cur_max ? cur : cur_max; + + /* + * $cur is relative to $cap. Scale it down accordingly so that + * it's in the same scale as other CPUs and $cur_sum/$cap_sum + * makes sense. + */ + cur_sum += cur * cap / SCX_CPUPERF_ONE; + cap_sum += cap; + + if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { + scx_bpf_error("failed to look up cpu_ctx"); + goto out; + } + + /* collect target */ + cur = cpuc->cpuperf_target; + target_sum += cur; + target_min = cur < target_min ? cur : target_min; + target_max = cur > target_max ? 
cur : target_max; + } + + cpuperf_min = cur_min; + cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; + cpuperf_max = cur_max; + + cpuperf_target_min = target_min; + cpuperf_target_avg = target_sum / nr_online_cpus; + cpuperf_target_max = target_max; +out: + scx_bpf_put_cpumask(online); +} + +static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + monitor_cpuperf(); + + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + return 0; +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { + u32 key = 0; + struct bpf_timer *timer; + s32 ret; + print_cpus(); - return scx_bpf_create_dsq(SHARED_DSQ, -1); + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(&monitor_timer, &key); + if (!timer) + return -ESRCH; + + bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, monitor_timerfn); + + return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); } void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) @@ -509,6 +644,7 @@ SCX_OPS_DEFINE(qmap_ops, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .tick = (void *)qmap_tick, .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, .init_task = (void *)qmap_init_task, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index bc36ec4f88a7..4d41c0cb1dab 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -116,6 +116,14 @@ int main(int argc, char **argv) nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, skel->bss->nr_reenqueued, skel->bss->nr_dequeued, skel->bss->nr_core_sched_execed); + if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) + printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", + skel->bss->cpuperf_min, + skel->bss->cpuperf_avg, + skel->bss->cpuperf_max, + skel->bss->cpuperf_target_min, + skel->bss->cpuperf_target_avg, + skel->bss->cpuperf_target_max); fflush(stdout); sleep(1); } -- cgit v1.2.3 From f97dcd0fcf7a95aaf448a9d1a7ed6c95e16dfcdb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 27 Jun 2024 18:11:32 +0100 Subject: sched_ext: Fix spelling mistake: "intead" -> "instead" There is a spelling mistake in the help text. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Tejun Heo --- tools/sched_ext/scx_qmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 4d41c0cb1dab..e4e3ecffc4cf 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -31,7 +31,7 @@ const char help_fmt[] = " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" " -S Suppress qmap-specific debug dump\n" -" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -p Switch only tasks on SCHED_EXT policy instead of all\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; -- cgit v1.2.3 From 6203ef73fa5c0358f7960b038628259be1448724 Mon Sep 17 00:00:00 2001 From: Hongyan Xia Date: Mon, 8 Jul 2024 15:01:18 +0100 Subject: sched/ext: Add BPF function to fetch rq rq contains many useful fields to implement a custom scheduler. For example, various clock signals like clock_task and clock_pelt can be used to track load. It also contains stats in other sched_classes, which are useful to drive scheduling decisions in ext. tj: Put the new helper below scx_bpf_task_*() helpers. 
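For illustration only (not part of this patch), a minimal sketch of how a BPF scheduler might use the new kfunc; the op name is hypothetical and it assumes the rq fields of interest are readable through BTF once a non-NULL pointer is returned:

	void BPF_STRUCT_OPS(example_tick, struct task_struct *p)
	{
		struct rq *rq = scx_bpf_cpu_rq(scx_bpf_task_cpu(p));

		/* scx_bpf_cpu_rq() returns NULL for an invalid CPU */
		if (rq)
			bpf_printk("cpu%d clock_task=%llu",
				   scx_bpf_task_cpu(p), rq->clock_task);
	}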
Signed-off-by: Hongyan Xia Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 13 +++++++++++++ tools/sched_ext/include/scx/common.bpf.h | 1 + 2 files changed, 14 insertions(+) (limited to 'tools') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 33bfb5a88b48..525102f3ff5b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6113,6 +6113,18 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) return task_cpu(p); } +/** + * scx_bpf_cpu_rq - Fetch the rq of a CPU + * @cpu: CPU of the rq + */ +__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) +{ + if (!ops_cpu_valid(cpu, NULL)) + return NULL; + + return cpu_rq(cpu); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -6137,6 +6149,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index dbbda0e35c5d..965d20324114 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -57,6 +57,7 @@ s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} -- cgit v1.2.3 From 650ba21b131ed1f8ee57826b2c6295a3be221132 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jul 2024 14:30:55 -1000 Subject: sched_ext: Implement DSQ iterator DSQs are very opaque in the consumption path. The BPF scheduler has no way of knowing which tasks are being considered and which one is picked. This patch adds a BPF DSQ iterator. - Allows iterating tasks queued on a DSQ in the dispatch order or reverse from anywhere using bpf_for_each(scx_dsq) or calling the iterator kfuncs directly. - Has an ordering guarantee where only tasks which were already queued when the iteration started are visible and consumable during the iteration. v5: - Add a comment to the naked list_empty(&dsq->list) test in consume_dispatch_q() to explain the reasoning behind the lockless test and by extension why nldsq_next_task() isn't used there. - scx_qmap changes separated into its own patch. v4: - bpf_iter_scx_dsq_new() declaration in common.bpf.h was using the wrong type for the last argument (bool rev instead of u64 flags). Fix it. v3: - Alexei pointed out that the iterator is too big to allocate on stack. Added a prep patch to reduce the size of the cursor. Now bpf_iter_scx_dsq is 48 bytes and bpf_iter_scx_dsq_kern is 40 bytes on 64bit. - u32_before() comparison factored out. v2: - scx_bpf_consume_task() is separated out into a separate patch. - DSQ seq and iter flags don't need to be u64. Use u32.
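For illustration only (not part of this patch), a minimal usage sketch; MY_DSQ is a placeholder for a user DSQ created earlier with scx_bpf_create_dsq(), and the scx_qmap patch later in this series adds a real usage along the same lines:

	/* MY_DSQ: hypothetical id of a DSQ created in ops.init() */
	#define MY_DSQ 0

	static void count_my_dsq(void)
	{
		struct task_struct *p;
		u32 nr = 0;

		bpf_rcu_read_lock();
		bpf_for_each(scx_dsq, p, MY_DSQ, 0)
			nr++;
		bpf_rcu_read_unlock();

		bpf_printk("%u tasks queued on MY_DSQ", nr);
	}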
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Alexei Starovoitov Cc: bpf@vger.kernel.org --- include/linux/sched/ext.h | 3 + kernel/sched/ext.c | 192 ++++++++++++++++++++++++++++++- tools/sched_ext/include/scx/common.bpf.h | 3 + 3 files changed, 196 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index eb9cfd18a923..593d2f4909dd 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -61,6 +61,7 @@ struct scx_dispatch_q { struct list_head list; /* tasks in dispatch order */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; + u32 seq; /* used by BPF iter */ u64 id; struct rhash_head hash_node; struct llist_node free_node; @@ -123,6 +124,7 @@ enum scx_kf_mask { struct scx_dsq_list_node { struct list_head node; + bool is_bpf_iter_cursor; }; /* @@ -133,6 +135,7 @@ struct sched_ext_entity { struct scx_dispatch_q *dsq; struct scx_dsq_list_node dsq_list; /* dispatch order */ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ + u32 dsq_seq; u32 dsq_flags; /* protected by DSQ lock */ u32 flags; /* protected by rq lock */ u32 weight; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 069c2f33883c..f16d72d72635 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -926,6 +926,11 @@ static u32 highest_bit(u32 flags) return ((u64)1 << bit) >> 1; } +static bool u32_before(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1066,6 +1071,73 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, return true; } +/** + * nldsq_next_task - Iterate to the next task in a non-local DSQ + * @dsq: user dsq being iterated + * @cur: current position, %NULL to start iteration + * @rev: walk backwards + * + * Returns %NULL when iteration is finished. + */ +static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, + struct task_struct *cur, bool rev) +{ + struct list_head *list_node; + struct scx_dsq_list_node *dsq_lnode; + + lockdep_assert_held(&dsq->lock); + + if (cur) + list_node = &cur->scx.dsq_list.node; + else + list_node = &dsq->list; + + /* find the next task, need to skip BPF iteration cursors */ + do { + if (rev) + list_node = list_node->prev; + else + list_node = list_node->next; + + if (list_node == &dsq->list) + return NULL; + + dsq_lnode = container_of(list_node, struct scx_dsq_list_node, + node); + } while (dsq_lnode->is_bpf_iter_cursor); + + return container_of(dsq_lnode, struct task_struct, scx.dsq_list); +} + +#define nldsq_for_each_task(p, dsq) \ + for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ + (p) = nldsq_next_task((dsq), (p), false)) + + +/* + * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] + * dispatch order. BPF-visible iterator is opaque and larger to allow future + * changes without breaking backward compatibility. Can be used with + * bpf_for_each(). See bpf_iter_scx_dsq_*(). + */ +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ + SCX_DSQ_ITER_REV = 1U << 0, + + __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV, +}; + +struct bpf_iter_scx_dsq_kern { + struct scx_dsq_list_node cursor; + struct scx_dispatch_q *dsq; + u32 dsq_seq; + u32 flags; +} __attribute__((aligned(8))); + +struct bpf_iter_scx_dsq { + u64 __opaque[6]; +} __attribute__((aligned(8))); + /* * SCX task iterator.
@@ -1415,7 +1487,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, * tested easily when adding the first task. */ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && - !list_empty(&dsq->list))) + nldsq_next_task(dsq, NULL, false))) scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", dsq->id); @@ -1447,6 +1519,10 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, list_add_tail(&p->scx.dsq_list.node, &dsq->list); } + /* seq records the order tasks are queued, used by BPF DSQ iterator */ + dsq->seq++; + p->scx.dsq_seq = dsq->seq; + dsq_mod_nr(dsq, 1); p->scx.dsq = dsq; @@ -2104,12 +2180,17 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, { struct task_struct *p; retry: + /* + * The caller can't expect to successfully consume a task if the task's + * addition to @dsq isn't guaranteed to be visible somehow. Test + * @dsq->list without locking and skip if it seems empty. + */ if (list_empty(&dsq->list)) return false; raw_spin_lock(&dsq->lock); - list_for_each_entry(p, &dsq->list, scx.dsq_list.node) { + nldsq_for_each_task(p, dsq) { struct rq *task_rq = task_rq(p); if (rq == task_rq) { @@ -5705,6 +5786,110 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) destroy_dsq(dsq_id); } +/** + * bpf_iter_scx_dsq_new - Create a DSQ iterator + * @it: iterator to initialize + * @dsq_id: DSQ to iterate + * @flags: %SCX_DSQ_ITER_* + * + * Initialize BPF iterator @it which can be used with bpf_for_each() to walk + * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes + * tasks which are already queued when this function is invoked. + */ +__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, + u64 flags) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > + sizeof(struct bpf_iter_scx_dsq)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != + __alignof__(struct bpf_iter_scx_dsq)); + + if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS) + return -EINVAL; + + kit->dsq = find_non_local_dsq(dsq_id); + if (!kit->dsq) + return -ENOENT; + + INIT_LIST_HEAD(&kit->cursor.node); + kit->cursor.is_bpf_iter_cursor = true; + kit->dsq_seq = READ_ONCE(kit->dsq->seq); + kit->flags = flags; + + return 0; +} + +/** + * bpf_iter_scx_dsq_next - Progress a DSQ iterator + * @it: iterator to progress + * + * Return the next task. See bpf_iter_scx_dsq_new(). + */ +__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + bool rev = kit->flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + unsigned long flags; + + if (!kit->dsq) + return NULL; + + raw_spin_lock_irqsave(&kit->dsq->lock, flags); + + if (list_empty(&kit->cursor.node)) + p = NULL; + else + p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); + + /* + * Only tasks which were queued before the iteration started are + * visible. This bounds BPF iterations and guarantees that vtime never + * jumps in the other direction while iterating. 
+ */ + do { + p = nldsq_next_task(kit->dsq, p, rev); + } while (p && unlikely(u32_before(kit->dsq_seq, p->scx.dsq_seq))); + + if (p) { + if (rev) + list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); + else + list_move(&kit->cursor.node, &p->scx.dsq_list.node); + } else { + list_del_init(&kit->cursor.node); + } + + raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + + return p; +} + +/** + * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator + * @it: iterator to destroy + * + * Undo bpf_iter_scx_dsq_new(). + */ +__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + + if (!kit->dsq) + return; + + if (!list_empty(&kit->cursor.node)) { + unsigned long flags; + + raw_spin_lock_irqsave(&kit->dsq->lock, flags); + list_del_init(&kit->cursor.node); + raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + } + kit->dsq = NULL; +} + __bpf_kfunc_end_defs(); static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, @@ -6138,6 +6323,9 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 965d20324114..20280df62857 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -39,6 +39,9 @@ u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; +struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; +void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; -- cgit v1.2.3 From 6fbd643318a1a5f3caea7f94bfe035efbb293ddb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jul 2024 14:30:55 -1000 Subject: sched_ext/scx_qmap: Add an example usage of DSQ iterator Implement periodic dumping of the shared DSQ to demonstrate the use of the newly added DSQ iterator.
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Alexei Starovoitov Cc: bpf@vger.kernel.org --- tools/sched_ext/scx_qmap.bpf.c | 25 +++++++++++++++++++++++++ tools/sched_ext/scx_qmap.c | 8 ++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index b1d0b09c966e..27e35066a602 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -36,6 +36,7 @@ const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; +const volatile bool print_shared_dsq; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -604,10 +605,34 @@ out: scx_bpf_put_cpumask(online); } +/* + * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of + * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to + * see meaningful dumps in the trace pipe. + */ +static void dump_shared_dsq(void) +{ + struct task_struct *p; + s32 nr; + + if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ))) + return; + + bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr); + + bpf_rcu_read_lock(); + bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV) + bpf_printk("%s[%d]", p->comm, p->pid); + bpf_rcu_read_unlock(); +} + static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { monitor_cpuperf(); + if (print_shared_dsq) + dump_shared_dsq(); + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; } diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index e4e3ecffc4cf..304f0488a386 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-d PID] [-D LEN] [-p] [-v]\n" +" [-P] [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,6 +28,7 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" +" -P Print out DSQ content to trace_pipe every second, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" " -S Suppress qmap-specific debug dump\n" @@ -62,7 +63,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:d:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -82,6 +83,9 @@ int main(int argc, char **argv) case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; + case 'P': + skel->rodata->print_shared_dsq = true; + break; case 'd': skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); if (skel->rodata->disallow_tgid < 0) -- cgit v1.2.3 From 1edab907b57d42e2dcf4c16a00185a89209e8700 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 12 Jul 2024 08:20:33 -1000 Subject: sched_ext/scx_qmap: Pick idle CPU for direct dispatch on !wakeup enqueues Because there was no way to directly dispatch to the local DSQ of a remote CPU from ops.enqueue(), scx_qmap skipped looking for an idle CPU on !wakeup enqueues. 
This restriction was removed and sched_ext now allows SCX_DSQ_LOCAL_ON verdicts for direct dispatches. Factor out pick_direct_dispatch_cpu() from ops.select_cpu() and use it to direct dispatch from ops.enqueue() on !wakeup enqueues. Signed-off-by: Tejun Heo Acked-by: David Vernet Cc: Dan Schatzberg Cc: Changwoo Min Cc: Andrea Righi --- tools/sched_ext/scx_qmap.bpf.c | 39 ++++++++++++++++++++++++++++++--------- tools/sched_ext/scx_qmap.c | 5 +++-- 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 27e35066a602..892278f12dce 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -120,11 +120,26 @@ struct { } cpu_ctx_stor SEC(".maps"); /* Statistics */ -u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; u64 nr_core_sched_execed; u32 cpuperf_min, cpuperf_avg, cpuperf_max; u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; +static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) +{ + s32 cpu; + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + return cpu; + + return -1; +} + s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -137,17 +152,14 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, return -ESRCH; } - if (p->nr_cpus_allowed == 1 || - scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + cpu = pick_direct_dispatch_cpu(p, prev_cpu); + + if (cpu >= 0) { tctx->force_local = true; + return cpu; + } else { return prev_cpu; } - - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - return cpu; - - return prev_cpu; } static int weight_to_idx(u32 weight) @@ -172,6 +184,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) u32 pid = p->pid; int idx = weight_to_idx(p->scx.weight); void *ring; + s32 cpu; if (p->flags & PF_KTHREAD) { if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) @@ -207,6 +220,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ + if (!(enq_flags & SCX_ENQ_WAKEUP) && + (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { + __sync_fetch_and_add(&nr_ddsp_from_enq, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); + return; + } + /* * If the task was re-enqueued due to the CPU being preempted by a * higher priority scheduling class, just re-enqueue the task directly diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 304f0488a386..c9ca30d62b2b 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -116,10 +116,11 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64"\n", + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, skel->bss->nr_reenqueued, skel->bss->nr_dequeued, - skel->bss->nr_core_sched_execed); + skel->bss->nr_core_sched_execed, + skel->bss->nr_ddsp_from_enq); if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) printf("cpuperf: cur min/avg/max=%u/%u/%u target 
min/avg/max=%u/%u/%u\n", skel->bss->cpuperf_min, -- cgit v1.2.3 From dc86460e77e8cea4896ff19de905001922653311 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 19 Jun 2024 23:06:58 +0000 Subject: rcutorture: Add CFcommon.arch for arch-specific Kconfig options Add CFcommon.arch for arch-specific Kconfig options. In accordance with [1], [2] and [3], move the x86-specific kernel option CONFIG_HYPERVISOR_GUEST to CFcommon.i686 and CFcommon.x86_64, and also move the x86/PowerPC CONFIG_KVM_GUEST Kconfig option to CFcommon.i686, CFcommon.x86_64, and CFcommon.ppc64le. The "arch" in CFcommon.arch is taken from the "uname -m" command. [1] https://lore.kernel.org/all/20240427005626.1365935-1-zhouzhouyi@gmail.com/ [2] https://lore.kernel.org/all/059d36ce-6453-42be-a31e-895abd35d590@paulmck-laptop/ [3] https://lore.kernel.org/all/ZnBkHosMDhsh4H8g@J2N7QTR9R3/ Tested in x86_64 and PPC VM of Open Source Lab of Oregon State University. Fixes: a6fda6dab93c ("rcutorture: Tweak kvm options") Suggested-by: Paul E. McKenney Suggested-by: Mark Rutland Signed-off-by: Zhouyi Zhou Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/CFcommon | 2 -- tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le | 1 + tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 | 2 ++ 5 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index b33cd8753689..ad79784e552d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -68,6 +68,8 @@ config_override_param "--gdb options" KcList "$TORTURE_KCONFIG_GDB_ARG" config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG" config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG" config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" +config_override_param "$config_dir/CFcommon.$(uname -m)" KcList \ + "`cat $config_dir/CFcommon.$(uname -m) 2> /dev/null`" cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index 0e92d85313aa..217597e84905 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -1,7 +1,5 @@ CONFIG_RCU_TORTURE_TEST=y CONFIG_PRINTK_TIME=y -CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y -CONFIG_KVM_GUEST=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le 
b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le new file mode 100644 index 000000000000..133da04247ee --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le @@ -0,0 +1 @@ +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y -- cgit v1.2.3 From 9a13a324f40fc3846cf7867daffaccd785beb10c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:51:02 -0700 Subject: tools/rcu: Remove RCU Tasks Rude asynchronous APIs from rcu-updaters.sh The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are no longer. This commit therefore removes them from the rcu-updaters.sh script. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- tools/rcu/rcu-updaters.sh | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/rcu/rcu-updaters.sh b/tools/rcu/rcu-updaters.sh index 4ef1397927bb..8a5df3f22550 100755 --- a/tools/rcu/rcu-updaters.sh +++ b/tools/rcu/rcu-updaters.sh @@ -21,12 +21,10 @@ fi bpftrace -e 'kprobe:kvfree_call_rcu, kprobe:call_rcu, kprobe:call_rcu_tasks, - kprobe:call_rcu_tasks_rude, kprobe:call_rcu_tasks_trace, kprobe:call_srcu, kprobe:rcu_barrier, kprobe:rcu_barrier_tasks, - kprobe:rcu_barrier_tasks_rude, kprobe:rcu_barrier_tasks_trace, kprobe:srcu_barrier, kprobe:synchronize_rcu, -- cgit v1.2.3 From 6ea2987c9a7b6c5f37d08a3eaa664c9ff7467670 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 25 Jul 2024 18:54:18 +0200 Subject: tools/nolibc: include arch.h from string.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit string.h tests for the macros NOLIBC_ARCH_HAS_$FUNC to use the architecture-optimized function variants. However if string.h is included before arch.h header then that check does not work, leading to duplicate function definitions. Fixes: 553845eebd60 ("tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`") Fixes: 12108aa8c1a1 ("tools/nolibc: x86-64: Use `rep stosb` for `memset()`") Cc: stable@vger.kernel.org Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240725-arch-has-func-v1-1-5521ed354acd@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index f9ab28421e6d..9ec9c24f38c0 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -7,6 +7,7 @@ #ifndef _NOLIBC_STRING_H #define _NOLIBC_STRING_H +#include "arch.h" #include "std.h" static void *malloc(size_t len); -- cgit v1.2.3 From ae1f550efc11eaf1496c431d9c6e784cb49124c5 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 25 Jul 2024 19:10:44 +0200 Subject: tools/nolibc: add stdbool.h header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stdbool.h is very simple. Provide an implementation for the user convenience. 
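For illustration only (not part of this patch), a trivial program exercising the new header; the function and test value are hypothetical:

	#include <stdbool.h>

	static bool is_even(int x)
	{
		return (x & 1) == 0;
	}

	int main(void)
	{
		/* exit 0 when the predicate holds */
		return is_even(42) ? 0 : 1;
	}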
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240725-nolibc-stdbool-v1-1-a6ee2c80bcde@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/Makefile | 1 + tools/include/nolibc/nolibc.h | 3 ++- tools/include/nolibc/stdbool.h | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 tools/include/nolibc/stdbool.h (limited to 'tools') diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index e69c26abe1ea..a1f55fb24bb3 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -35,6 +35,7 @@ all_files := \ stackprotector.h \ std.h \ stdarg.h \ + stdbool.h \ stdint.h \ stdlib.h \ string.h \ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 989e707263a4..92436b1e4441 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -74,7 +74,8 @@ * -I../nolibc -o hello hello.c -lgcc * * The available standard (but limited) include files are: - * ctype.h, errno.h, signal.h, stdarg.h, stdio.h, stdlib.h, string.h, time.h + * ctype.h, errno.h, signal.h, stdarg.h, stdbool.h, stdio.h, stdlib.h, + * string.h, time.h * * In addition, the following ones are expected to be provided by the compiler: * float.h, stddef.h diff --git a/tools/include/nolibc/stdbool.h b/tools/include/nolibc/stdbool.h new file mode 100644 index 000000000000..60feece22f17 --- /dev/null +++ b/tools/include/nolibc/stdbool.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Boolean types support for NOLIBC + * Copyright (C) 2024 Thomas Weißschuh + */ + +#ifndef _NOLIBC_STDBOOL_H +#define _NOLIBC_STDBOOL_H + +#define bool _Bool +#define true 1 +#define false 0 + +#define __bool_true_false_are_defined 1 + +#endif /* _NOLIBC_STDBOOL_H */ -- cgit v1.2.3 From a63507f3b11de30fd7711ec244fe354fb4a01a09 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:29 +0800 Subject: selftests/bpf: Drop type of connect_to_fd_opts The "type" parameter of connect_to_fd_opts() is redundant with "server_fd": the type can be obtained inside by invoking getsockopt(SO_TYPE), without passing it in as a parameter. This patch drops the "type" parameter of connect_to_fd_opts() and updates its callers.
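For illustration only (not part of this patch), a standalone sketch of the lookup that the helper now performs internally; the function name is hypothetical:

	#include <sys/socket.h>

	static int sock_type_of(int fd)
	{
		int type;
		socklen_t optlen = sizeof(type);

		/* derive SOCK_STREAM/SOCK_DGRAM from the fd itself */
		if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &type, &optlen))
			return -1;
		return type;
	}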
Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/50d8ce7ab7ab0c0f4d211fc7cc4ebe3d3f63424c.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 21 +++++++++++---------- tools/testing/selftests/bpf/network_helpers.h | 2 +- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- .../testing/selftests/bpf/prog_tests/cgroup_v1v2.c | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index e0cba4178e41..15e0e0bb7553 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -328,14 +328,21 @@ error_close: return -1; } -int connect_to_fd_opts(int server_fd, int type, const struct network_helper_opts *opts) +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; - socklen_t addrlen; + socklen_t addrlen, optlen; + int type; if (!opts) opts = &default_opts; + optlen = sizeof(type); + if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) { + log_err("getsockopt(SOL_TYPE)"); + return -1; + } + addrlen = sizeof(addr); if (getsockname(server_fd, (struct sockaddr *)&addr, &addrlen)) { log_err("Failed to get server addr"); @@ -350,14 +357,8 @@ int connect_to_fd(int server_fd, int timeout_ms) struct network_helper_opts opts = { .timeout_ms = timeout_ms, }; - int type, protocol; socklen_t optlen; - - optlen = sizeof(type); - if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) { - log_err("getsockopt(SOL_TYPE)"); - return -1; - } + int protocol; optlen = sizeof(protocol); if (getsockopt(server_fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &optlen)) { @@ -366,7 +367,7 @@ int connect_to_fd(int server_fd, int timeout_ms) } opts.proto = protocol; - return connect_to_fd_opts(server_fd, type, &opts); + return connect_to_fd_opts(server_fd, &opts); } int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index aac5b94d6379..5b548c0c60de 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -71,7 +71,7 @@ int client_socket(int family, int type, int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); -int connect_to_fd_opts(int server_fd, int type, const struct network_helper_opts *opts); +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms); diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 63422f4f3896..1d494b4453f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -49,7 +49,7 @@ static bool start_test(char *addr_str, goto err; /* connect to server */ - *cli_fd = connect_to_fd_opts(*srv_fd, SOCK_STREAM, cli_opts); + *cli_fd = connect_to_fd_opts(*srv_fd, cli_opts); if (!ASSERT_NEQ(*cli_fd, -1, "connect_to_fd_opts")) goto err; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c 
b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c index 9709c8db7275..addf720428f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -32,7 +32,7 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) goto out; } - fd = connect_to_fd_opts(server_fd, SOCK_STREAM, &opts); + fd = connect_to_fd_opts(server_fd, &opts); if (fd < 0) err = -1; else @@ -52,7 +52,7 @@ void test_cgroup_v1v2(void) server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); if (!ASSERT_GE(server_fd, 0, "server_fd")) return; - client_fd = connect_to_fd_opts(server_fd, SOCK_STREAM, &opts); + client_fd = connect_to_fd_opts(server_fd, &opts); if (!ASSERT_GE(client_fd, 0, "client_fd")) { close(server_fd); return; -- cgit v1.2.3 From e1ee5a48b5b27e3e7bb294f80f7429c3d0466d19 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:30 +0800 Subject: selftests/bpf: Drop must_fail from network_helper_opts The struct member "must_fail" of struct network_helper_opts is only used in the cgroup_v1v2 tests, so it makes sense to drop it from network_helper_opts. The return value (fd) of connect_to_fd_opts() and the expected errno (EPERM) can be checked in cgroup_v1v2.c directly; there is no need to check them in connect_fd_to_addr() in network_helpers.c. This also makes the connect_fd_to_addr() function useless; it can be replaced by connect(). Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/3faf336019a9a48e2e8951f4cdebf19e3ac6e441.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 42 +++++----------------- tools/testing/selftests/bpf/network_helpers.h | 1 - .../testing/selftests/bpf/prog_tests/cgroup_v1v2.c | 14 ++++---- 3 files changed, 16 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 15e0e0bb7553..bc4947afa0b4 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -277,33 +277,6 @@ error_close: return -1; } -static int connect_fd_to_addr(int fd, - const struct sockaddr_storage *addr, - socklen_t addrlen, const bool must_fail) -{ - int ret; - - errno = 0; - ret = connect(fd, (const struct sockaddr *)addr, addrlen); - if (must_fail) { - if (!ret) { - log_err("Unexpected success to connect to server"); - return -1; - } - if (errno != EPERM) { - log_err("Unexpected error from connect to server"); - return -1; - } - } else { - if (ret) { - log_err("Failed to connect to server"); - return -1; - } - } - - return 0; -} - int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, const struct network_helper_opts *opts) { @@ -318,14 +291,13 @@ int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t add return -1; } - if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail)) - goto error_close; + if (connect(fd, (const struct sockaddr *)addr, addrlen)) { + log_err("Failed to connect to server"); + save_errno_close(fd); + return -1; + } return fd; - -error_close: - save_errno_close(fd); - return -1; } int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; @@ -383,8 +355,10 @@ int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) return -1; } - if (connect_fd_to_addr(client_fd, &addr, len, false)) + if (connect(client_fd, (const struct sockaddr *)&addr, len)) { +
log_err("Failed to connect to server"); return -1; + } return 0; } diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5b548c0c60de..f39eeb5a4594 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -23,7 +23,6 @@ typedef __u16 __sum16; struct network_helper_opts { int timeout_ms; - bool must_fail; int proto; /* +ve: Passed to listen() as-is. * 0: Default when the test does not set diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c index addf720428f7..64abba72ac10 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -9,9 +9,6 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) { - struct network_helper_opts opts = { - .must_fail = true, - }; struct connect4_dropper *skel; int fd, err = 0; @@ -32,11 +29,16 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) goto out; } - fd = connect_to_fd_opts(server_fd, &opts); - if (fd < 0) + errno = 0; + fd = connect_to_fd_opts(server_fd, NULL); + if (fd >= 0) { + log_err("Unexpected success to connect to server"); err = -1; - else close(fd); + } else if (errno != EPERM) { + log_err("Unexpected errno from connect to server"); + err = -1; + } out: connect4_dropper__destroy(skel); return err; -- cgit v1.2.3 From c70b2d9027ca39e84d4ee01da78a70308ce1fd4f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:31 +0800 Subject: selftests/bpf: Add connect_to_addr_str helper Similar to the connect_to_addr() helper for connecting to a server with a given sockaddr_storage type address, this patch adds a new helper named connect_to_addr_str() for connecting to a server with a given string type address "addr_str", which together with "family" and "port" forms the other parameters of connect_to_addr_str(). In connect_to_addr_str(), the parameters "family", "addr_str" and "port" are used to create a sockaddr_storage type address "addr" by invoking make_sockaddr(). This "addr", together with "addrlen", "type" and "opts", is then passed to connect_to_addr().
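A hypothetical caller then shrinks to a one-liner (the address and port values here are illustrative only):

	/* connect to a TCP server; a NULL opts falls back to default_opts */
	int fd = connect_to_addr_str(AF_INET, SOCK_STREAM, "127.0.0.1", 8080, NULL);

	if (fd < 0)
		log_err("Failed to connect to server");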
Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/647e82170831558dbde132a7a3d86df660dba2c4.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 15 +++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index bc4947afa0b4..9c98a60cf1e2 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -300,6 +300,21 @@ int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t add return fd; } +int connect_to_addr_str(int family, int type, const char *addr_str, __u16 port, + const struct network_helper_opts *opts) +{ + struct sockaddr_storage addr; + socklen_t addrlen; + + if (!opts) + opts = &default_opts; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return connect_to_addr(type, &addr, addrlen, opts); +} + int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index f39eeb5a4594..cce56955371f 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -69,6 +69,8 @@ int client_socket(int family, int type, const struct network_helper_opts *opts); int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, const struct network_helper_opts *opts); +int connect_to_addr_str(int family, int type, const char *addr_str, __u16 port, + const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); -- cgit v1.2.3 From 844f7315e77a017fe90652b411e88b119c782a1f Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Thu, 18 Jul 2024 22:57:43 +0000 Subject: selftests/bpf: Use auto-dependencies for test objects Make use of -M compiler options when building .test.o objects to generate .d files and avoid re-building all tests every time. Previously, if a single test bpf program under selftests/bpf/progs/*.c had changed, make would rebuild all the *.bpf.o, *.skel.h and *.test.o objects, which is a lot of unnecessary work. A typical dependency chain is: progs/x.c -> x.bpf.o -> x.skel.h -> x.test.o -> trunner_binary However, for many tests it's not a 1:1 mapping by name, and so far %.test.o have been simply dependent on all %.skel.h files, and %.skel.h files on all %.bpf.o objects. Avoid full rebuilds by instructing the compiler (via -MMD) to produce *.d files with real dependencies, and appropriately including them. Exploit the make feature that rebuilds included makefiles if they have changed, by setting %.test.d as a prerequisite for %.test.o files. A couple of examples of compilation time speedup (after the first clean build): $ touch progs/verifier_and.c && time make -j8 Before: real 0m16.651s After: real 0m2.245s $ touch progs/read_vsyscall.c && time make -j8 Before: real 0m15.743s After: real 0m1.575s A drawback of this change is that there is now an overhead due to make processing lots of .d files, which may potentially slow down unrelated targets. However, the time to make all from scratch hasn't changed significantly: $ make clean && time make -j8 Before: real 1m31.148s After: real 1m30.309s Suggested-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/VJihUTnvtwEgv_mOnpfy7EgD9D2MPNoHO-MlANeLIzLJPGhDeyOuGKIYyKgk0O6KPjfM-MuhtvPwZcngN8WFqbTnTRyCSMc2aMZ1ODm1T_g=@pm.me --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 43 +++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 5025401323af..4e4aae8aa7ec 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -31,6 +31,7 @@ test_tcp_check_syncookie_user test_sysctl xdping test_cpp +*.d *.subskel.h *.skel.h *.lskel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dd49c1d23a60..05b234248b38 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -477,7 +477,8 @@ xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o xdp_features.skel.h-deps := xdp_features.bpf.o -LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) +LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) +LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. @@ -556,7 +557,11 @@ $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) -$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) +$(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/% + +# .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites +.SECONDEXPANSION: +$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_OUTPUT)/%: $$$$(%-deps) $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) @@ -566,6 +571,14 @@ $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ $(Q)$$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) $(Q)rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) + +# When the compiler generates a %.d file, only skel basenames (not +# full paths) are specified as prerequisites for corresponding %.o +# file. This target makes %.skel.h basename dependent on full paths, +# linking generated %.d dependency with actual %.skel.h files.
+$(notdir %.skel.h): $(TRUNNER_OUTPUT)/%.skel.h + @true + endif # ensure we set up tests.h header generation rule just once @@ -583,14 +596,19 @@ endif # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ - $(TRUNNER_EXTRA_HDRS) \ - $(TRUNNER_BPF_OBJS) \ - $(TRUNNER_BPF_SKELS) \ - $(TRUNNER_BPF_LSKELS) \ - $(TRUNNER_BPF_SKELS_LINKED) \ - $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + +$(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ + $(TRUNNER_TESTS_DIR)/%.c \ + $(TRUNNER_EXTRA_HDRS) \ + $(TRUNNER_BPF_SKELS) \ + $(TRUNNER_BPF_LSKELS) \ + $(TRUNNER_BPF_SKELS_LINKED) \ + $$(BPFOBJ) | $(TRUNNER_OUTPUT) + +include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ @@ -608,6 +626,9 @@ ifneq ($2:$(OUTPUT),:$(shell pwd)) $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif +# some X.test.o files have runtime dependencies on Y.bpf.o files +$(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) + $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(RESOLVE_BTFIDS) \ @@ -768,8 +789,8 @@ $(OUTPUT)/uprobe_multi: uprobe_multi.c EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ - feature bpftool \ - $(addprefix $(OUTPUT)/,*.o *.skel.h *.lskel.h *.subskel.h \ + feature bpftool \ + $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc bpf_testmod.ko \ bpf_test_no_cfi.ko \ liburandom_read.so) -- cgit v1.2.3 From 4bf79f9be434e000c8e12fe83b2f4402480f1460 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:53 -0700 Subject: bpf: Track equal scalars history on per-instruction level Use bpf_verifier_state->jmp_history to track which registers were updated by find_equal_scalars() (renamed to collect_linked_regs()) when conditional jump was verified. Use recorded information in backtrack_insn() to propagate precision. E.g. for the following program: while verifying instructions 1: r1 = r0 | 2: if r1 < 8 goto ... | push r0,r1 as linked registers in jmp_history 3: if r0 > 16 goto ... | push r0,r1 as linked registers in jmp_history 4: r2 = r10 | 5: r2 += r0 v mark_chain_precision(r0) while doing mark_chain_precision(r0) 5: r2 += r0 | mark r0 precise 4: r2 = r10 | 3: if r0 > 16 goto ... | mark r0,r1 as precise 2: if r1 < 8 goto ... | mark r0,r1 as precise 1: r1 = r0 v Technically, do this as follows: - Use 10 bits to identify each register that gains range because of sync_linked_regs(): - 3 bits for frame number; - 6 bits for register or stack slot number; - 1 bit to indicate if register is spilled. - Use u64 as a vector of 6 such records + 4 bits for vector length. - Augment struct bpf_jmp_history_entry with a field 'linked_regs' representing such vector. - When doing check_cond_jmp_op() remember up to 6 registers that gain range because of sync_linked_regs() in such a vector. - Don't propagate range information and reset IDs for registers that don't fit in 6-value vector. - Push a pair {instruction index, linked registers vector} to bpf_verifier_state->jmp_history. 
- When doing backtrack_insn() check if any of recorded linked registers is currently marked precise, if so mark all linked registers as precise. This also requires fixes for two test_verifier tests: - precise: test 1 - precise: test 2 Both tests contain the following instruction sequence: 19: (bf) r2 = r9 ; R2=scalar(id=3) R9=scalar(id=3) 20: (a5) if r2 < 0x8 goto pc+1 ; R2=scalar(id=3,umin=8) 21: (95) exit 22: (07) r2 += 1 ; R2_w=scalar(id=3+1,...) 23: (bf) r1 = r10 ; R1_w=fp0 R10=fp0 24: (07) r1 += -8 ; R1_w=fp-8 25: (b7) r3 = 0 ; R3_w=0 26: (85) call bpf_probe_read_kernel#113 The call to bpf_probe_read_kernel() at (26) forces r2 to be precise. Previously, this forced all registers with same id to become precise immediately when mark_chain_precision() is called. After this change, the precision is propagated to registers sharing same id only when 'if' instruction is backtracked. Hence verification log for both tests is changed: regs=r2,r9 -> regs=r2 for instructions 25..20. Fixes: 904e6ddf4133 ("bpf: Use scalar ids in mark_chain_precision()") Reported-by: Hao Sun Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-2-eddyz87@gmail.com Closes: https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/ --- include/linux/bpf_verifier.h | 4 + kernel/bpf/verifier.c | 245 +++++++++++++++++++-- .../bpf/progs/verifier_subprog_precision.c | 2 +- tools/testing/selftests/bpf/verifier/precise.c | 20 +- 4 files changed, 239 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6503c85b10a3..731a0a4ac13c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -371,6 +371,10 @@ struct bpf_jmp_history_entry { u32 prev_idx : 22; /* special flags, e.g., whether insn is doing register stack spill/load */ u32 flags : 10; + /* additional registers that need precision tracking when this + * jump is backtracked, vector of six 10-bit records + */ + u64 linked_regs; }; /* Maximum number of register states that can exist at once */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cb5441ad75f..8dade7b51430 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3335,9 +3335,87 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].jmp_point; } +#define LR_FRAMENO_BITS 3 +#define LR_SPI_BITS 6 +#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) +#define LR_SIZE_BITS 4 +#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1) +#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1) +#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) +#define LR_SPI_OFF LR_FRAMENO_BITS +#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) +#define LINKED_REGS_MAX 6 + +struct linked_reg { + u8 frameno; + union { + u8 spi; + u8 regno; + }; + bool is_reg; +}; + +struct linked_regs { + int cnt; + struct linked_reg entries[LINKED_REGS_MAX]; +}; + +static struct linked_reg *linked_regs_push(struct linked_regs *s) +{ + if (s->cnt < LINKED_REGS_MAX) + return &s->entries[s->cnt++]; + + return NULL; +} + +/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track + * number of elements currently in stack. 
+ * Pack one history entry for linked registers as 10 bits in the following format: + * - 3-bits frameno + * - 6-bits spi_or_reg + * - 1-bit is_reg + */ +static u64 linked_regs_pack(struct linked_regs *s) +{ + u64 val = 0; + int i; + + for (i = 0; i < s->cnt; ++i) { + struct linked_reg *e = &s->entries[i]; + u64 tmp = 0; + + tmp |= e->frameno; + tmp |= e->spi << LR_SPI_OFF; + tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF; + + val <<= LR_ENTRY_BITS; + val |= tmp; + } + val <<= LR_SIZE_BITS; + val |= s->cnt; + return val; +} + +static void linked_regs_unpack(u64 val, struct linked_regs *s) +{ + int i; + + s->cnt = val & LR_SIZE_MASK; + val >>= LR_SIZE_BITS; + + for (i = 0; i < s->cnt; ++i) { + struct linked_reg *e = &s->entries[i]; + + e->frameno = val & LR_FRAMENO_MASK; + e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK; + e->is_reg = (val >> LR_IS_REG_OFF) & 0x1; + val >>= LR_ENTRY_BITS; + } +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags) + int insn_flags, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -3353,6 +3431,10 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + WARN_ONCE(env->cur_hist_ent->linked_regs != 0, + "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", + env->insn_idx, env->cur_hist_ent->linked_regs); + env->cur_hist_ent->linked_regs = linked_regs; return 0; } @@ -3367,6 +3449,7 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -3532,6 +3615,11 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) return bt->reg_masks[bt->frame] & (1 << reg); } +static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) +{ + return bt->reg_masks[frame] & (1 << reg); +} + static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) { return bt->stack_masks[frame] & (1ull << slot); @@ -3576,6 +3664,42 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) } } +/* If any register R in hist->linked_regs is marked as precise in bt, + * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. 
+ */ +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +{ + struct linked_regs linked_regs; + bool some_precise = false; + int i; + + if (!hist || hist->linked_regs == 0) + return; + + linked_regs_unpack(hist->linked_regs, &linked_regs); + for (i = 0; i < linked_regs.cnt; ++i) { + struct linked_reg *e = &linked_regs.entries[i]; + + if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) || + (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) { + some_precise = true; + break; + } + } + + if (!some_precise) + return; + + for (i = 0; i < linked_regs.cnt; ++i) { + struct linked_reg *e = &linked_regs.entries[i]; + + if (e->is_reg) + bt_set_frame_reg(bt, e->frameno, e->regno); + else + bt_set_frame_slot(bt, e->frameno, e->spi); + } +} + static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); /* For given verifier state backtrack_insn() is called from the last insn to @@ -3615,6 +3739,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } + /* If there is a history record that some registers gained range at this insn, + * propagate precision marks to those registers, so that bt_is_reg_set() + * accounts for these registers. + */ + bt_sync_linked_regs(bt, hist); + if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; @@ -3844,7 +3974,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, */ bt_set_reg(bt, dreg); bt_set_reg(bt, sreg); - /* else dreg K + } else if (BPF_SRC(insn->code) == BPF_K) { + /* dreg K * Only dreg still needs precision before * this insn, so for the K-based conditional * there is nothing new to be marked. @@ -3862,6 +3993,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* to be analyzed */ return -ENOTSUPP; } + /* Propagate precision marks to linked registers, to account for + * registers marked as precise in this function. + */ + bt_sync_linked_regs(bt, hist); return 0; } @@ -4456,7 +4591,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, if (!src_reg->id && !tnum_is_const(src_reg->var_off)) /* Ensure that src_reg has a valid ID that will be copied to - * dst_reg and then will be used by find_equal_scalars() to + * dst_reg and then will be used by sync_linked_regs() to * propagate min/max range. 
*/ src_reg->id = ++env->id_gen; @@ -4625,7 +4760,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -4930,7 +5065,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -14099,7 +14234,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, u64 val = reg_const_value(src_reg, alu32); if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in find_equal_scalars() later */ + /* prevent overflow in sync_linked_regs() later */ val > (u32)S32_MAX) { /* * If the register already went through rX += val @@ -14114,7 +14249,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, } else { /* * Make sure ID is cleared otherwise dst_reg min/max could be - * incorrectly propagated into other registers by find_equal_scalars() + * incorrectly propagated into other registers by sync_linked_regs() */ dst_reg->id = 0; } @@ -14264,7 +14399,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly - * propagated into src_reg by find_equal_scalars() + * propagated into src_reg by sync_linked_regs() */ if (!is_src_reg_u32) dst_reg->id = 0; @@ -15087,14 +15222,66 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } -static void find_equal_scalars(struct bpf_verifier_state *vstate, - struct bpf_reg_state *known_reg) +static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg, + u32 id, u32 frameno, u32 spi_or_reg, bool is_reg) +{ + struct linked_reg *e; + + if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id) + return; + + e = linked_regs_push(reg_set); + if (e) { + e->frameno = frameno; + e->is_reg = is_reg; + e->regno = spi_or_reg; + } else { + reg->id = 0; + } +} + +/* For all R being scalar registers or spilled scalar registers + * in verifier state, save R in linked_regs if R->id == id. + * If there are too many Rs sharing same id, reset id for leftover Rs. + */ +static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, + struct linked_regs *linked_regs) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + id = id & ~BPF_ADD_CONST; + for (i = vstate->curframe; i >= 0; i--) { + func = vstate->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + __collect_linked_regs(linked_regs, reg, id, i, j, true); + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + __collect_linked_regs(linked_regs, reg, id, i, j, false); + } + } +} + +/* For all R in linked_regs, copy known_reg range into R + * if R->id == known_reg->id. 
+ */ +static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, + struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; - struct bpf_func_state *state; struct bpf_reg_state *reg; + struct linked_reg *e; + int i; - bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + for (i = 0; i < linked_regs->cnt; ++i) { + e = &linked_regs->entries[i]; + reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno] + : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr; if (reg->type != SCALAR_VALUE || reg == known_reg) continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) @@ -15112,7 +15299,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, copy_register_state(reg, known_reg); /* * Must preserve off, id and add_const flag, - * otherwise another find_equal_scalars() will be incorrect. + * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; @@ -15120,7 +15307,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); } - })); + } } static int check_cond_jmp_op(struct bpf_verifier_env *env, @@ -15131,6 +15318,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; + struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -15245,6 +15433,21 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } + /* Push scalar registers sharing same ID to jump history, + * do this before creating 'other_branch', so that both + * 'this_branch' and 'other_branch' share this history + * if parent state is created. 
+ */ + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id) + collect_linked_regs(this_branch, src_reg->id, &linked_regs); + if (dst_reg->type == SCALAR_VALUE && dst_reg->id) + collect_linked_regs(this_branch, dst_reg->id, &linked_regs); + if (linked_regs.cnt > 1) { + err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + if (err) + return err; + } + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); if (!other_branch) @@ -15275,13 +15478,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - find_equal_scalars(this_branch, src_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + sync_linked_regs(this_branch, src_reg, &linked_regs); + sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { - find_equal_scalars(this_branch, dst_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + sync_linked_regs(this_branch, dst_reg, &linked_regs); + sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); } /* if one pointer register is compared to another pointer @@ -16770,7 +16973,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * * First verification path is [1-6]: * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; - * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark + * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark * r7 <= X, because r6 and r7 share same id. * Next verification path is [1-4, 6]. * @@ -17563,7 +17766,7 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0); + err = err ? : push_jmp_history(env, cur, 0, 0); err = err ? 
: propagate_precision(env, &sl->state); if (err) return err; @@ -17831,7 +18034,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state, 0); + err = push_jmp_history(env, state, 0, 0); if (err) return err; } diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 6a6fad625f7e..9d415f7ce599 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -278,7 +278,7 @@ __msg("mark_precise: frame0: last_idx 14 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4") -__msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0") +__msg("mark_precise: frame0: regs=r0,r6 stack= before 10: (bf) r6 = r0") __msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop") /* State entering callback body popped from states stack */ __msg("from 9 to 17: frame1:") diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 90643ccc221d..64d722199e8f 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -39,11 +39,11 @@ .result = VERBOSE_ACCEPT, .errstr = "mark_precise: frame0: last_idx 26 first_idx 20\ - mark_precise: frame0: regs=r2,r9 stack= before 25\ - mark_precise: frame0: regs=r2,r9 stack= before 24\ - mark_precise: frame0: regs=r2,r9 stack= before 23\ - mark_precise: frame0: regs=r2,r9 stack= before 22\ - mark_precise: frame0: regs=r2,r9 stack= before 20\ + mark_precise: frame0: regs=r2 stack= before 25\ + mark_precise: frame0: regs=r2 stack= before 24\ + mark_precise: frame0: regs=r2 stack= before 23\ + mark_precise: frame0: regs=r2 stack= before 22\ + mark_precise: frame0: regs=r2 stack= before 20\ mark_precise: frame0: parent state regs=r2,r9 stack=:\ mark_precise: frame0: last_idx 19 first_idx 10\ mark_precise: frame0: regs=r2,r9 stack= before 19\ @@ -100,11 +100,11 @@ .errstr = "26: (85) call bpf_probe_read_kernel#113\ mark_precise: frame0: last_idx 26 first_idx 22\ - mark_precise: frame0: regs=r2,r9 stack= before 25\ - mark_precise: frame0: regs=r2,r9 stack= before 24\ - mark_precise: frame0: regs=r2,r9 stack= before 23\ - mark_precise: frame0: regs=r2,r9 stack= before 22\ - mark_precise: frame0: parent state regs=r2,r9 stack=:\ + mark_precise: frame0: regs=r2 stack= before 25\ + mark_precise: frame0: regs=r2 stack= before 24\ + mark_precise: frame0: regs=r2 stack= before 23\ + mark_precise: frame0: regs=r2 stack= before 22\ + mark_precise: frame0: parent state regs=r2 stack=:\ mark_precise: frame0: last_idx 20 first_idx 20\ mark_precise: frame0: regs=r2,r9 stack= before 20\ mark_precise: frame0: parent state regs=r2,r9 stack=:\ -- cgit v1.2.3 From 842edb5507a1038e009d27e69d13b94b6f085763 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:54 -0700 Subject: bpf: Remove mark_precise_scalar_ids() Function mark_precise_scalar_ids() is superseded by bt_sync_linked_regs() and equal scalars tracking in jump history. 
mark_precise_scalar_ids() propagates precision over registers sharing the same ID on parent/child state boundaries, while jump history records allow bt_sync_linked_regs() to propagate the same information with instruction-level granularity, which is strictly more precise. This commit removes mark_precise_scalar_ids() and updates test cases in progs/verifier_scalar_ids to reflect the new verifier behavior. The tests are updated in the following manner: - mark_precise_scalar_ids() propagated precision regardless of presence of conditional jumps, while the new jump-history-based logic only kicks in when conditional jumps are present. Hence test cases are augmented with conditional jumps to still trigger precision propagation. - As equal scalars tracking no longer relies on parent/child state boundaries, some test cases are no longer interesting; such test cases are removed, namely: - precision_same_state and precision_cross_state are superseded by linked_regs_bpf_k; - precision_same_state_broken_link and equal_scalars_broken_link are superseded by linked_regs_broken_link. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-3-eddyz87@gmail.com --- kernel/bpf/verifier.c | 115 -------------- .../selftests/bpf/progs/verifier_scalar_ids.c | 171 +++++++-------------- tools/testing/selftests/bpf/verifier/precise.c | 8 +- 3 files changed, 59 insertions(+), 235 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8dade7b51430..f8c474e8e597 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4124,96 +4124,6 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ } } -static bool idset_contains(struct bpf_idset *s, u32 id) -{ - u32 i; - - for (i = 0; i < s->count; ++i) - if (s->ids[i] == (id & ~BPF_ADD_CONST)) - return true; - - return false; -} - -static int idset_push(struct bpf_idset *s, u32 id) -{ - if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) - return -EFAULT; - s->ids[s->count++] = id & ~BPF_ADD_CONST; - return 0; -} - -static void idset_reset(struct bpf_idset *s) -{ - s->count = 0; -} - -/* Collect a set of IDs for all registers currently marked as precise in env->bt. - * Mark all registers with these IDs as precise.
- */ -static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_idset *precise_ids = &env->idset_scratch; - struct backtrack_state *bt = &env->bt; - struct bpf_func_state *func; - struct bpf_reg_state *reg; - DECLARE_BITMAP(mask, 64); - int i, fr; - - idset_reset(precise_ids); - - for (fr = bt->frame; fr >= 0; fr--) { - func = st->frame[fr]; - - bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); - for_each_set_bit(i, mask, 32) { - reg = &func->regs[i]; - if (!reg->id || reg->type != SCALAR_VALUE) - continue; - if (idset_push(precise_ids, reg->id)) - return -EFAULT; - } - - bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); - for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) - break; - if (!is_spilled_scalar_reg(&func->stack[i])) - continue; - reg = &func->stack[i].spilled_ptr; - if (!reg->id) - continue; - if (idset_push(precise_ids, reg->id)) - return -EFAULT; - } - } - - for (fr = 0; fr <= st->curframe; ++fr) { - func = st->frame[fr]; - - for (i = BPF_REG_0; i < BPF_REG_10; ++i) { - reg = &func->regs[i]; - if (!reg->id) - continue; - if (!idset_contains(precise_ids, reg->id)) - continue; - bt_set_frame_reg(bt, fr, i); - } - for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) { - if (!is_spilled_scalar_reg(&func->stack[i])) - continue; - reg = &func->stack[i].spilled_ptr; - if (!reg->id) - continue; - if (!idset_contains(precise_ids, reg->id)) - continue; - bt_set_frame_slot(bt, fr, i); - } - } - - return 0; -} - /* * __mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) @@ -4346,31 +4256,6 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bt->frame, last_idx, first_idx, subseq_idx); } - /* If some register with scalar ID is marked as precise, - * make sure that all registers sharing this ID are also precise. - * This is needed to estimate effect of find_equal_scalars(). - * Do this at the last instruction of each state, - * bpf_reg_state::id fields are valid for these instructions. - * - * Allows to track precision in situation like below: - * - * r2 = unknown value - * ... - * --- state #0 --- - * ... - * r1 = r2 // r1 and r2 now share the same ID - * ... - * --- state #1 {r1.id = A, r2.id = A} --- - * ... - * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1 - * ... - * --- state #2 {r1.id = A, r2.id = A} --- - * r3 = r10 - * r3 += r1 // need to mark both r1 and r2 - */ - if (mark_precise_scalar_ids(env, st)) - return -EFAULT; - if (last_idx < 0) { /* we are at the entry into subprog, which * is expected for global funcs, but only if diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index 13b29a7faa71..b642d5c24154 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -5,54 +5,27 @@ #include "bpf_misc.h" /* Check that precision marks propagate through scalar IDs. - * Registers r{0,1,2} have the same scalar ID at the moment when r0 is - * marked to be precise, this mark is immediately propagated to r{1,2}. + * Registers r{0,1,2} have the same scalar ID. + * Range information is propagated for scalars sharing same ID. + * Check that precision mark for r0 causes precision marks for r{1,2} + * when range information is propagated for 'if ' insn. 
*/ SEC("socket") __success __log_level(2) -__msg("frame0: regs=r0,r1,r2 stack= before 4: (bf) r3 = r10") -__msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") -__flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_same_state(void) -{ - asm volatile ( - /* r0 = random number up to 0xff */ - "call %[bpf_ktime_get_ns];" - "r0 &= 0xff;" - /* tie r0.id == r1.id == r2.id */ - "r1 = r0;" - "r2 = r0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs - */ - "r3 = r10;" - "r3 += r0;" - "r0 = 0;" - "exit;" - : - : __imm(bpf_ktime_get_ns) - : __clobber_all); -} - -/* Same as precision_same_state, but mark propagates through state / - * parent state boundary. - */ -SEC("socket") -__success __log_level(2) -__msg("frame0: last_idx 6 first_idx 5 subseq_idx -1") -__msg("frame0: regs=r0,r1,r2 stack= before 5: (bf) r3 = r10") +/* first 'if' branch */ +__msg("6: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 4: (25) if r1 > 0x7 goto pc+0") __msg("frame0: parent state regs=r0,r1,r2 stack=:") -__msg("frame0: regs=r0,r1,r2 stack= before 4: (05) goto pc+0") __msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: parent state regs=r0 stack=:") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") +/* second 'if' branch */ +__msg("from 4 to 5: ") +__msg("6: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 5: (bf) r3 = r10") +__msg("frame0: regs=r0 stack= before 4: (25) if r1 > 0x7 goto pc+0") +/* parent state already has r{0,1,2} as precise */ +__msg("frame0: parent state regs= stack=:") __flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_cross_state(void) +__naked void linked_regs_bpf_k(void) { asm volatile ( /* r0 = random number up to 0xff */ @@ -61,9 +34,8 @@ __naked void precision_cross_state(void) /* tie r0.id == r1.id == r2.id */ "r1 = r0;" "r2 = r0;" - /* force checkpoint */ - "goto +0;" - /* force r0 to be precise, this immediately marks r1 and r2 as + "if r1 > 7 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as * precise as well because of shared IDs */ "r3 = r10;" @@ -75,59 +47,18 @@ __naked void precision_cross_state(void) : __clobber_all); } -/* Same as precision_same_state, but break one of the +/* Same as linked_regs_bpf_k, but break one of the * links, note that r1 is absent from regs=... in __msg below. 
*/ SEC("socket") __success __log_level(2) -__msg("frame0: regs=r0,r2 stack= before 5: (bf) r3 = r10") -__msg("frame0: regs=r0,r2 stack= before 4: (b7) r1 = 0") -__msg("frame0: regs=r0,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") -__flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_same_state_broken_link(void) -{ - asm volatile ( - /* r0 = random number up to 0xff */ - "call %[bpf_ktime_get_ns];" - "r0 &= 0xff;" - /* tie r0.id == r1.id == r2.id */ - "r1 = r0;" - "r2 = r0;" - /* break link for r1, this is the only line that differs - * compared to the previous test - */ - "r1 = 0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs - */ - "r3 = r10;" - "r3 += r0;" - "r0 = 0;" - "exit;" - : - : __imm(bpf_ktime_get_ns) - : __clobber_all); -} - -/* Same as precision_same_state_broken_link, but with state / - * parent state boundary. - */ -SEC("socket") -__success __log_level(2) -__msg("frame0: regs=r0,r2 stack= before 6: (bf) r3 = r10") -__msg("frame0: regs=r0,r2 stack= before 5: (b7) r1 = 0") -__msg("frame0: parent state regs=r0,r2 stack=:") -__msg("frame0: regs=r0,r1,r2 stack= before 4: (05) goto pc+0") -__msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") +__msg("7: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 6: (bf) r3 = r10") __msg("frame0: parent state regs=r0 stack=:") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") +__msg("frame0: regs=r0 stack= before 5: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r2 stack=:") __flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_cross_state_broken_link(void) +__naked void linked_regs_broken_link(void) { asm volatile ( /* r0 = random number up to 0xff */ @@ -136,18 +67,13 @@ __naked void precision_cross_state_broken_link(void) /* tie r0.id == r1.id == r2.id */ "r1 = r0;" "r2 = r0;" - /* force checkpoint, although link between r1 and r{0,2} is - * broken by the next statement current precision tracking - * algorithm can't react to it and propagates mark for r1 to - * the parent state. 
- */ - "goto +0;" /* break link for r1, this is the only line that differs - * compared to precision_cross_state() + * compared to the previous test */ "r1 = 0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs + "if r0 > 7 goto +0;" + /* force r0 to be precise, + * this eventually marks r2 as precise because of shared IDs */ "r3 = r10;" "r3 += r0;" @@ -164,10 +90,16 @@ __naked void precision_cross_state_broken_link(void) */ SEC("socket") __success __log_level(2) -__msg("11: (0f) r2 += r1") +__msg("12: (0f) r2 += r1") /* Current state */ -__msg("frame2: last_idx 11 first_idx 10 subseq_idx -1") -__msg("frame2: regs=r1 stack= before 10: (bf) r2 = r10") +__msg("frame2: last_idx 12 first_idx 11 subseq_idx -1 ") +__msg("frame2: regs=r1 stack= before 11: (bf) r2 = r10") +__msg("frame2: parent state regs=r1 stack=") +__msg("frame1: parent state regs= stack=") +__msg("frame0: parent state regs= stack=") +/* Parent state */ +__msg("frame2: last_idx 10 first_idx 10 subseq_idx 11 ") +__msg("frame2: regs=r1 stack= before 10: (25) if r1 > 0x7 goto pc+0") __msg("frame2: parent state regs=r1 stack=") /* frame1.r{6,7} are marked because mark_precise_scalar_ids() * looks for all registers with frame2.r1.id in the current state @@ -192,7 +124,7 @@ __msg("frame1: regs=r1 stack= before 4: (85) call pc+1") __msg("frame0: parent state regs=r1,r6 stack=") /* Parent state */ __msg("frame0: last_idx 3 first_idx 1 subseq_idx 4") -__msg("frame0: regs=r0,r1,r6 stack= before 3: (bf) r6 = r0") +__msg("frame0: regs=r1,r6 stack= before 3: (bf) r6 = r0") __msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") __msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") __flag(BPF_F_TEST_STATE_FREQ) @@ -230,7 +162,8 @@ static __naked __noinline __used void precision_many_frames__bar(void) { asm volatile ( - /* force r1 to be precise, this immediately marks: + "if r1 > 7 goto +0;" + /* force r1 to be precise, this eventually marks: * - bar frame r1 * - foo frame r{1,6,7} * - main frame r{1,6} @@ -247,14 +180,16 @@ void precision_many_frames__bar(void) */ SEC("socket") __success __log_level(2) +__msg("11: (0f) r2 += r1") /* foo frame */ -__msg("frame1: regs=r1 stack=-8,-16 before 9: (bf) r2 = r10") +__msg("frame1: regs=r1 stack= before 10: (bf) r2 = r10") +__msg("frame1: regs=r1 stack= before 9: (25) if r1 > 0x7 goto pc+0") __msg("frame1: regs=r1 stack=-8,-16 before 8: (7b) *(u64 *)(r10 -16) = r1") __msg("frame1: regs=r1 stack=-8 before 7: (7b) *(u64 *)(r10 -8) = r1") __msg("frame1: regs=r1 stack= before 4: (85) call pc+2") /* main frame */ -__msg("frame0: regs=r0,r1 stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r1") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") +__msg("frame0: regs=r1 stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r1") +__msg("frame0: regs=r1 stack= before 2: (bf) r1 = r0") __msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") __flag(BPF_F_TEST_STATE_FREQ) __naked void precision_stack(void) @@ -283,7 +218,8 @@ void precision_stack__foo(void) */ "*(u64*)(r10 - 8) = r1;" "*(u64*)(r10 - 16) = r1;" - /* force r1 to be precise, this immediately marks: + "if r1 > 7 goto +0;" + /* force r1 to be precise, this eventually marks: * - foo frame r1,fp{-8,-16} * - main frame r1,fp{-8} */ @@ -299,15 +235,17 @@ void precision_stack__foo(void) SEC("socket") __success __log_level(2) /* r{6,7} */ -__msg("11: (0f) r3 += r7") -__msg("frame0: regs=r6,r7 stack= before 10: (bf) r3 = r10") +__msg("12: (0f) r3 += r7") +__msg("frame0: regs=r7 stack= before 11: (bf) r3 
= r10") +__msg("frame0: regs=r7 stack= before 9: (25) if r7 > 0x7 goto pc+0") /* ... skip some insns ... */ __msg("frame0: regs=r6,r7 stack= before 3: (bf) r7 = r0") __msg("frame0: regs=r0,r6 stack= before 2: (bf) r6 = r0") /* r{8,9} */ -__msg("12: (0f) r3 += r9") -__msg("frame0: regs=r8,r9 stack= before 11: (0f) r3 += r7") +__msg("13: (0f) r3 += r9") +__msg("frame0: regs=r9 stack= before 12: (0f) r3 += r7") /* ... skip some insns ... */ +__msg("frame0: regs=r9 stack= before 10: (25) if r9 > 0x7 goto pc+0") __msg("frame0: regs=r8,r9 stack= before 7: (bf) r9 = r0") __msg("frame0: regs=r0,r8 stack= before 6: (bf) r8 = r0") __flag(BPF_F_TEST_STATE_FREQ) @@ -328,8 +266,9 @@ __naked void precision_two_ids(void) "r9 = r0;" /* clear r0 id */ "r0 = 0;" - /* force checkpoint */ - "goto +0;" + /* propagate equal scalars precision */ + "if r7 > 7 goto +0;" + "if r9 > 7 goto +0;" "r3 = r10;" /* force r7 to be precise, this also marks r6 */ "r3 += r7;" diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 64d722199e8f..59a020c35647 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -106,7 +106,7 @@ mark_precise: frame0: regs=r2 stack= before 22\ mark_precise: frame0: parent state regs=r2 stack=:\ mark_precise: frame0: last_idx 20 first_idx 20\ - mark_precise: frame0: regs=r2,r9 stack= before 20\ + mark_precise: frame0: regs=r2 stack= before 20\ mark_precise: frame0: parent state regs=r2,r9 stack=:\ mark_precise: frame0: last_idx 19 first_idx 17\ mark_precise: frame0: regs=r2,r9 stack= before 19\ @@ -183,10 +183,10 @@ .prog_type = BPF_PROG_TYPE_XDP, .flags = BPF_F_TEST_STATE_FREQ, .errstr = "mark_precise: frame0: last_idx 7 first_idx 7\ - mark_precise: frame0: parent state regs=r4 stack=-8:\ + mark_precise: frame0: parent state regs=r4 stack=:\ mark_precise: frame0: last_idx 6 first_idx 4\ - mark_precise: frame0: regs=r4 stack=-8 before 6: (b7) r0 = -1\ - mark_precise: frame0: regs=r4 stack=-8 before 5: (79) r4 = *(u64 *)(r10 -8)\ + mark_precise: frame0: regs=r4 stack= before 6: (b7) r0 = -1\ + mark_precise: frame0: regs=r4 stack= before 5: (79) r4 = *(u64 *)(r10 -8)\ mark_precise: frame0: regs= stack=-8 before 4: (7b) *(u64 *)(r3 -8) = r0\ mark_precise: frame0: parent state regs=r0 stack=:\ mark_precise: frame0: last_idx 3 first_idx 3\ -- cgit v1.2.3 From bebc17b1c03b224a0b4aec6a171815e39f8ba9bc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:55 -0700 Subject: selftests/bpf: Tests for per-insn sync_linked_regs() precision tracking Add a few test cases to verify precision tracking for scalars gaining range because of sync_linked_regs(): - check what happens when more than 6 registers might gain range in sync_linked_regs(); - check if precision is propagated correctly when operand of conditional jump gained range in sync_linked_regs() and one of linked registers is marked precise; - check if precision is propagated correctly when operand of conditional jump gained range in sync_linked_regs() and a other-linked operand of the conditional jump is marked precise; - add a minimized reproducer for precision tracking bug reported in [0]; - Check that mark_chain_precision() for one of the conditional jump operands does not trigger equal scalars precision propagation. 
[0] https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/ Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-4-eddyz87@gmail.com --- .../selftests/bpf/progs/verifier_scalar_ids.c | 165 +++++++++++++++++++++ 1 file changed, 165 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index b642d5c24154..2ecf77b623e0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -47,6 +47,72 @@ __naked void linked_regs_bpf_k(void) : __clobber_all); } +/* Registers r{0,1,2} share same ID when 'if r1 > ...' insn is processed, + * check that verifier marks r{1,2} as precise while backtracking + * 'if r1 > ...' with r0 already marked. + */ +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("frame0: regs=r0 stack= before 5: (2d) if r1 > r3 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3 stack=:") +__msg("frame0: regs=r0,r1,r2,r3 stack= before 4: (b7) r3 = 7") +__naked void linked_regs_bpf_x_src(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id == r2.id */ + "r1 = r0;" + "r2 = r0;" + "r3 = 7;" + "if r1 > r3 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as + * precise as well because of shared IDs + */ + "r4 = r10;" + "r4 += r0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +/* Registers r{0,1,2} share same ID when 'if r1 > r3' insn is processed, + * check that verifier marks r{0,1,2} as precise while backtracking + * 'if r1 > r3' with r3 already marked. + */ +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("frame0: regs=r3 stack= before 5: (2d) if r1 > r3 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3 stack=:") +__msg("frame0: regs=r0,r1,r2,r3 stack= before 4: (b7) r3 = 7") +__naked void linked_regs_bpf_x_dst(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id == r2.id */ + "r1 = r0;" + "r2 = r0;" + "r3 = 7;" + "if r1 > r3 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as + * precise as well because of shared IDs + */ + "r4 = r10;" + "r4 += r3;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + /* Same as linked_regs_bpf_k, but break one of the * links, note that r1 is absent from regs=... in __msg below. */ @@ -280,6 +346,105 @@ __naked void precision_two_ids(void) : __clobber_all); } +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +/* check thar r0 and r6 have different IDs after 'if', + * collect_linked_regs() can't tie more than 6 registers for a single insn. 
+ */ +__msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") +__msg("9: (bf) r6 = r6 ; R6_w=scalar(id=2") +/* check that r{0-5} are marked precise after 'if' */ +__msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") +__naked void linked_regs_too_many_regs(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r{0-6} IDs */ + "r1 = r0;" + "r2 = r0;" + "r3 = r0;" + "r4 = r0;" + "r5 = r0;" + "r6 = r0;" + /* propagate range for r{0-6} */ + "if r0 > 7 goto +0;" + /* make r6 appear in the log */ + "r6 = r6;" + /* force r0 to be precise, + * this would cause r{0-4} to be precise because of shared IDs + */ + "r7 = r10;" + "r7 += r0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__failure __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("regs=r7 stack= before 5: (3d) if r8 >= r0") +__msg("parent state regs=r0,r7,r8") +__msg("regs=r0,r7,r8 stack= before 4: (25) if r0 > 0x1") +__msg("div by zero") +__naked void linked_regs_broken_link_2(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r8 = r0;" + "call %[bpf_get_prandom_u32];" + "if r0 > 1 goto +0;" + /* r7.id == r8.id, + * thus r7 precision implies r8 precision, + * which implies r0 precision because of the conditional below. + */ + "if r8 >= r0 goto 1f;" + /* break id relation between r7 and r8 */ + "r8 += r8;" + /* make r7 precise */ + "if r7 == 0 goto 1f;" + "r0 /= 0;" +"1:" + "r0 = 42;" + "exit;" + : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Check that mark_chain_precision() for one of the conditional jump + * operands does not trigger equal scalars precision propagation. + */ +SEC("socket") +__success __log_level(2) +__msg("3: (25) if r1 > 0x100 goto pc+0") +__msg("frame0: regs=r1 stack= before 2: (bf) r1 = r0") +__naked void cjmp_no_linked_regs_trigger(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id */ + "r1 = r0;" + /* the jump below would be predicted, thus r1 would be marked precise, + * this should not imply precision mark for r0 + */ + "if r1 > 256 goto +0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + /* Verify that check_ids() is used by regsafe() for scalars. * * r9 = ... some pointer with range X ... -- cgit v1.2.3 From cfbf25481d6dec0089c99c9d33a2ea634fe8f008 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:56 -0700 Subject: selftests/bpf: Update comments find_equal_scalars->sync_linked_regs find_equal_scalars() is renamed to sync_linked_regs(), this commit updates existing references in the selftests comments. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-5-eddyz87@gmail.com --- tools/testing/selftests/bpf/progs/verifier_spill_fill.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 85e48069c9e6..9d288ec7a168 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -402,7 +402,7 @@ __naked void spill_32bit_of_64bit_fail(void) *(u32*)(r10 - 8) = r1; \ /* 32-bit fill r2 from stack. 
*/ \ r2 = *(u32*)(r10 - 8); \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. If the ID was mistakenly preserved on spill, this would\ * cause the verifier to think that r1 is also equal to zero in one of\ @@ -441,7 +441,7 @@ __naked void spill_16bit_of_32bit_fail(void) *(u16*)(r10 - 8) = r1; \ /* 16-bit fill r2 from stack. */ \ r2 = *(u16*)(r10 - 8); \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. If the ID was mistakenly preserved on spill, this would\ * cause the verifier to think that r1 is also equal to zero in one of\ @@ -833,7 +833,7 @@ __naked void spill_64bit_of_64bit_ok(void) *(u64*)(r10 - 8) = r0; \ /* 64-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u64*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -866,7 +866,7 @@ __naked void spill_32bit_of_32bit_ok(void) *(u32*)(r10 - 8) = r0; \ /* 32-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u32*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -899,7 +899,7 @@ __naked void spill_16bit_of_16bit_ok(void) *(u16*)(r10 - 8) = r0; \ /* 16-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u16*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -932,7 +932,7 @@ __naked void spill_8bit_of_8bit_ok(void) *(u8*)(r10 - 8) = r0; \ /* 8-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u8*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -1029,7 +1029,7 @@ __naked void fill_32bit_after_spill_64bit_preserve_id(void) "r1 = *(u32*)(r10 - 4);" #endif " \ - /* Compare r1 with another register to trigger find_equal_scalars. */\ + /* Compare r1 with another register to trigger sync_linked_regs. */\ r2 = 0; \ if r1 != r2 goto l0_%=; \ /* The result of this comparison is predefined. */\ @@ -1070,7 +1070,7 @@ __naked void fill_32bit_after_spill_64bit_clear_id(void) "r2 = *(u32*)(r10 - 4);" #endif " \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. 
If the ID was mistakenly preserved on fill, this would\ * cause the verifier to think that r1 is also equal to zero in one of\ -- cgit v1.2.3 From b83b936f3e9a3c63896852198a1814e90e68eef5 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 14 Jul 2024 20:39:02 +0800 Subject: selftests/bpf: Add testcases for tailcall hierarchy fixing Add some test cases to confirm the tailcall hierarchy issue has been fixed. On x64, the selftests result is: cd tools/testing/selftests/bpf && ./test_progs -t tailcalls 327/18 tailcalls/tailcall_bpf2bpf_hierarchy_1:OK 327/19 tailcalls/tailcall_bpf2bpf_hierarchy_fentry:OK 327/20 tailcalls/tailcall_bpf2bpf_hierarchy_fexit:OK 327/21 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_fexit:OK 327/22 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_entry:OK 327/23 tailcalls/tailcall_bpf2bpf_hierarchy_2:OK 327/24 tailcalls/tailcall_bpf2bpf_hierarchy_3:OK 327 tailcalls:OK Summary: 1/24 PASSED, 0 SKIPPED, 0 FAILED On arm64, the selftests result is: cd tools/testing/selftests/bpf && ./test_progs -t tailcalls 327/18 tailcalls/tailcall_bpf2bpf_hierarchy_1:OK 327/19 tailcalls/tailcall_bpf2bpf_hierarchy_fentry:OK 327/20 tailcalls/tailcall_bpf2bpf_hierarchy_fexit:OK 327/21 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_fexit:OK 327/22 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_entry:OK 327/23 tailcalls/tailcall_bpf2bpf_hierarchy_2:OK 327/24 tailcalls/tailcall_bpf2bpf_hierarchy_3:OK 327 tailcalls:OK Summary: 1/24 PASSED, 0 SKIPPED, 0 FAILED Acked-by: Eduard Zingerman Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240714123902.32305-4-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/tailcalls.c | 320 +++++++++++++++++++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy1.c | 34 +++ .../bpf/progs/tailcall_bpf2bpf_hierarchy2.c | 70 +++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy3.c | 62 ++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c | 35 +++ tools/testing/selftests/bpf/progs/tc_dummy.c | 12 + 6 files changed, 533 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c create mode 100644 tools/testing/selftests/bpf/progs/tc_dummy.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 59993fc9c0d7..e01fabb8cc41 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -3,6 +3,8 @@ #include #include #include "tailcall_poke.skel.h" +#include "tailcall_bpf2bpf_hierarchy2.skel.h" +#include "tailcall_bpf2bpf_hierarchy3.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations @@ -1187,6 +1189,312 @@ out: tailcall_poke__destroy(call); } +static void test_tailcall_hierarchy_count(const char *which, bool test_fentry, + bool test_fexit, + bool test_fentry_entry) +{ + int err, map_fd, prog_fd, main_data_fd, fentry_data_fd, fexit_data_fd, i, val; + struct bpf_object *obj = NULL, *fentry_obj = NULL, *fexit_obj = NULL; + struct bpf_link *fentry_link = NULL, *fexit_link = NULL; + struct bpf_program *prog, *fentry_prog; + struct bpf_map *prog_array, *data_map; + int fentry_prog_fd; + char buff[128] = {}; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + 
.data_in = buff, + .data_size_in = sizeof(buff), + .repeat = 1, + ); + + err = bpf_prog_test_load(which, BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (!ASSERT_OK(err, "load obj")) + return; + + prog = bpf_object__find_program_by_name(obj, "entry"); + if (!ASSERT_OK_PTR(prog, "find entry prog")) + goto out; + + prog_fd = bpf_program__fd(prog); + if (!ASSERT_GE(prog_fd, 0, "prog_fd")) + goto out; + + if (test_fentry_entry) { + fentry_obj = bpf_object__open_file("tailcall_bpf2bpf_hierarchy_fentry.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fentry_obj, "open fentry_obj file")) + goto out; + + fentry_prog = bpf_object__find_program_by_name(fentry_obj, + "fentry"); + if (!ASSERT_OK_PTR(fentry_prog, "find fentry prog")) + goto out; + + err = bpf_program__set_attach_target(fentry_prog, prog_fd, + "entry"); + if (!ASSERT_OK(err, "set_attach_target entry")) + goto out; + + err = bpf_object__load(fentry_obj); + if (!ASSERT_OK(err, "load fentry_obj")) + goto out; + + fentry_link = bpf_program__attach_trace(fentry_prog); + if (!ASSERT_OK_PTR(fentry_link, "attach_trace")) + goto out; + + fentry_prog_fd = bpf_program__fd(fentry_prog); + if (!ASSERT_GE(fentry_prog_fd, 0, "fentry_prog_fd")) + goto out; + + prog_array = bpf_object__find_map_by_name(fentry_obj, "jmp_table"); + if (!ASSERT_OK_PTR(prog_array, "find jmp_table")) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (!ASSERT_GE(map_fd, 0, "map_fd")) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &fentry_prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + data_map = bpf_object__find_map_by_name(fentry_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find data_map")) + goto out; + + } else { + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (!ASSERT_OK_PTR(prog_array, "find jmp_table")) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (!ASSERT_GE(map_fd, 0, "map_fd")) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + data_map = bpf_object__find_map_by_name(obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find data_map")) + goto out; + } + + if (test_fentry) { + fentry_obj = bpf_object__open_file("tailcall_bpf2bpf_fentry.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fentry_obj, "open fentry_obj file")) + goto out; + + prog = bpf_object__find_program_by_name(fentry_obj, "fentry"); + if (!ASSERT_OK_PTR(prog, "find fentry prog")) + goto out; + + err = bpf_program__set_attach_target(prog, prog_fd, + "subprog_tail"); + if (!ASSERT_OK(err, "set_attach_target subprog_tail")) + goto out; + + err = bpf_object__load(fentry_obj); + if (!ASSERT_OK(err, "load fentry_obj")) + goto out; + + fentry_link = bpf_program__attach_trace(prog); + if (!ASSERT_OK_PTR(fentry_link, "attach_trace")) + goto out; + } + + if (test_fexit) { + fexit_obj = bpf_object__open_file("tailcall_bpf2bpf_fexit.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fexit_obj, "open fexit_obj file")) + goto out; + + prog = bpf_object__find_program_by_name(fexit_obj, "fexit"); + if (!ASSERT_OK_PTR(prog, "find fexit prog")) + goto out; + + err = bpf_program__set_attach_target(prog, prog_fd, + "subprog_tail"); + if (!ASSERT_OK(err, "set_attach_target subprog_tail")) + goto out; + + err = bpf_object__load(fexit_obj); + if (!ASSERT_OK(err, "load fexit_obj")) + goto out; + + fexit_link = bpf_program__attach_trace(prog); + if (!ASSERT_OK_PTR(fexit_link, "attach_trace")) + goto out; + } + + err =
bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "tailcall"); + ASSERT_EQ(topts.retval, 1, "tailcall retval"); + + main_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(main_data_fd, 0, "main_data_fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(main_data_fd, &i, &val); + ASSERT_OK(err, "tailcall count"); + ASSERT_EQ(val, 34, "tailcall count"); + + if (test_fentry) { + data_map = bpf_object__find_map_by_name(fentry_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find tailcall_bpf2bpf_fentry.bss map")) + goto out; + + fentry_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(fentry_data_fd, 0, + "find tailcall_bpf2bpf_fentry.bss map fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(fentry_data_fd, &i, &val); + ASSERT_OK(err, "fentry count"); + ASSERT_EQ(val, 68, "fentry count"); + } + + if (test_fexit) { + data_map = bpf_object__find_map_by_name(fexit_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find tailcall_bpf2bpf_fexit.bss map")) + goto out; + + fexit_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(fexit_data_fd, 0, + "find tailcall_bpf2bpf_fexit.bss map fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(fexit_data_fd, &i, &val); + ASSERT_OK(err, "fexit count"); + ASSERT_EQ(val, 68, "fexit count"); + } + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (!ASSERT_OK(err, "delete_elem from jmp_table")) + goto out; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "tailcall"); + ASSERT_EQ(topts.retval, 1, "tailcall retval"); + + i = 0; + err = bpf_map_lookup_elem(main_data_fd, &i, &val); + ASSERT_OK(err, "tailcall count"); + ASSERT_EQ(val, 35, "tailcall count"); + + if (test_fentry) { + i = 0; + err = bpf_map_lookup_elem(fentry_data_fd, &i, &val); + ASSERT_OK(err, "fentry count"); + ASSERT_EQ(val, 70, "fentry count"); + } + + if (test_fexit) { + i = 0; + err = bpf_map_lookup_elem(fexit_data_fd, &i, &val); + ASSERT_OK(err, "fexit count"); + ASSERT_EQ(val, 70, "fexit count"); + } + +out: + bpf_link__destroy(fentry_link); + bpf_link__destroy(fexit_link); + bpf_object__close(fentry_obj); + bpf_object__close(fexit_obj); + bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_hierarchy_1 checks that the count value of the tail + * call limit enforcement matches with expectations when tailcalls are preceded + * with two bpf2bpf calls. + * + * subprog --tailcall-> entry + * entry < + * subprog --tailcall-> entry + */ +static void test_tailcall_bpf2bpf_hierarchy_1(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + false, false, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry checks that the count value of the + * tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls, and the two subprogs are traced by fentry. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + true, false, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fexit checks that the count value of the tail + * call limit enforcement matches with expectations when tailcalls are preceded + * with two bpf2bpf calls, and the two subprogs are traced by fexit. 
+ */ +static void test_tailcall_bpf2bpf_hierarchy_fexit(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + false, true, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry_fexit checks that the count value of + * the tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls, and the two subprogs are traced by both + * fentry and fexit. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry_fexit(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + true, true, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry_entry checks that the count value of + * the tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls in fentry. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry_entry(void) +{ + test_tailcall_hierarchy_count("tc_dummy.bpf.o", false, false, true); +} + +/* test_tailcall_bpf2bpf_hierarchy_2 checks that the count value of the tail + * call limit enforcement matches with expectations: + * + * subprog_tail0 --tailcall-> classifier_0 -> subprog_tail0 + * entry < + * subprog_tail1 --tailcall-> classifier_1 -> subprog_tail1 + */ +static void test_tailcall_bpf2bpf_hierarchy_2(void) +{ + RUN_TESTS(tailcall_bpf2bpf_hierarchy2); +} + +/* test_tailcall_bpf2bpf_hierarchy_3 checks that the count value of the tail + * call limit enforcement matches with expectations: + * + * subprog with jmp_table0 to classifier_0 + * entry --tailcall-> classifier_0 < + * subprog with jmp_table1 to classifier_0 + */ +static void test_tailcall_bpf2bpf_hierarchy_3(void) +{ + RUN_TESTS(tailcall_bpf2bpf_hierarchy3); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1223,4 +1531,16 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_fentry_entry(); if (test__start_subtest("tailcall_poke")) test_tailcall_poke(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_1")) + test_tailcall_bpf2bpf_hierarchy_1(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry")) + test_tailcall_bpf2bpf_hierarchy_fentry(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fexit")) + test_tailcall_bpf2bpf_hierarchy_fexit(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry_fexit")) + test_tailcall_bpf2bpf_hierarchy_fentry_fexit(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry_entry")) + test_tailcall_bpf2bpf_hierarchy_fentry_entry(); + test_tailcall_bpf2bpf_hierarchy_2(); + test_tailcall_bpf2bpf_hierarchy_3(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c new file mode 100644 index 000000000000..327ca395e860 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_legacy.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 0); + return 0; +} + +SEC("tc") +int entry(struct __sk_buff *skb) +{ + int ret = 1; + + count++; + subprog_tail(skb); + subprog_tail(skb); + + return ret; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c 
b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c new file mode 100644 index 000000000000..37604b0b97af --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int classifier_0(struct __sk_buff *skb); +int classifier_1(struct __sk_buff *skb); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + [1] = (void *) &classifier_1, + }, +}; + +int count0 = 0; +int count1 = 0; + +static __noinline +int subprog_tail0(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 0); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_0(struct __sk_buff *skb) +{ + count0++; + subprog_tail0(skb); + return 0; +} + +static __noinline +int subprog_tail1(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 1); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_1(struct __sk_buff *skb) +{ + count1++; + subprog_tail1(skb); + return 0; +} + +__success +__retval(33) +SEC("tc") +int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) +{ + volatile int ret = 0; + + subprog_tail0(skb); + subprog_tail1(skb); + + asm volatile (""::"r+"(ret)); + return (count1 << 16) | count0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c new file mode 100644 index 000000000000..0cdbb781fcbc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int classifier_0(struct __sk_buff *skb); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table0 SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + }, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table1 SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + }, +}; + +int count = 0; + +static __noinline +int subprog_tail(struct __sk_buff *skb, void *jmp_table) +{ + bpf_tail_call_static(skb, jmp_table, 0); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_0(struct __sk_buff *skb) +{ + count++; + subprog_tail(skb, &jmp_table0); + subprog_tail(skb, &jmp_table1); + return count; +} + +__success +__retval(33) +SEC("tc") +int tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) +{ + volatile int ret = 0; + + bpf_tail_call_static(skb, &jmp_table0, 0); + + asm volatile (""::"r+"(ret)); + return ret; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c new file mode 100644 index 000000000000..c87f9ca982d3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Leon Hwang */ + +#include "vmlinux.h" +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; 
+ +static __noinline +int subprog_tail(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +SEC("fentry/dummy") +int BPF_PROG(fentry, struct sk_buff *skb) +{ + count++; + subprog_tail(ctx); + subprog_tail(ctx); + + return 0; +} + + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tc_dummy.c b/tools/testing/selftests/bpf/progs/tc_dummy.c new file mode 100644 index 000000000000..69a3d0dc8787 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tc_dummy.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_legacy.h" + +SEC("tc") +int entry(struct __sk_buff *skb) +{ + return 1; +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b7264f87f76c59698c7fd019a8797378fa17e7e3 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:33:50 +0800 Subject: bpftool: Refactor xdp attach/detach type judgment This commit makes no logical changes; it just increases code readability and facilitates the TCX prog expansion that will be implemented in the next patch. Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721143353.95980-2-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/net.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 968714b4c3d4..1b9f4225b394 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -684,10 +684,18 @@ static int do_attach(int argc, char **argv) } } + switch (attach_type) { /* attach xdp prog */ - if (is_prefix("xdp", attach_type_strings[attach_type])) - err = do_attach_detach_xdp(progfd, attach_type, ifindex, - overwrite); + case NET_ATTACH_TYPE_XDP: + case NET_ATTACH_TYPE_XDP_GENERIC: + case NET_ATTACH_TYPE_XDP_DRIVER: + case NET_ATTACH_TYPE_XDP_OFFLOAD: + err = do_attach_detach_xdp(progfd, attach_type, ifindex, overwrite); + break; + default: + break; + } + if (err) { p_err("interface %s attach failed: %s", attach_type_strings[attach_type], strerror(-err)); @@ -721,10 +729,18 @@ static int do_detach(int argc, char **argv) if (ifindex < 1) return -EINVAL; + switch (attach_type) { /* detach xdp prog */ - progfd = -1; - if (is_prefix("xdp", attach_type_strings[attach_type])) + case NET_ATTACH_TYPE_XDP: + case NET_ATTACH_TYPE_XDP_GENERIC: + case NET_ATTACH_TYPE_XDP_DRIVER: + case NET_ATTACH_TYPE_XDP_OFFLOAD: + progfd = -1; err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL); + break; + default: + break; + } if (err < 0) { p_err("interface %s detach failed: %s", -- cgit v1.2.3 From 3b9d4fee8ad32599ba20ab36da7694feef022b48 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:21 +0800 Subject: bpftool: Add net attach/detach command to tcx prog Now that attaching/detaching a tcx prog is supported in libbpf, we can add the new commands 'bpftool attach/detach tcx' that let users attach tcx progs with bpftool. # bpftool prog load tc_prog.bpf.o /sys/fs/bpf/tc_prog # bpftool prog show ... 192: sched_cls name tc_prog tag 187aeb611ad00cfc gpl loaded_at 2024-07-11T15:58:16+0800 uid 0 xlated 152B jited 97B memlock 4096B map_ids 100,99,97 btf_id 260 # bpftool net attach tcx_ingress name tc_prog dev lo # bpftool net ... tc: lo(1) tcx/ingress tc_prog prog_id 29 # bpftool net detach tcx_ingress dev lo # bpftool net ...
tc: # bpftool net attach tcx_ingress name tc_prog dev lo # bpftool net tc: lo(1) tcx/ingress tc_prog prog_id 29 Test environment: ubuntu_22_04, 6.7.0-060700-generic Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144221.96228-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/net.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 1b9f4225b394..2a51f1c25732 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -67,6 +67,8 @@ enum net_attach_type { NET_ATTACH_TYPE_XDP_GENERIC, NET_ATTACH_TYPE_XDP_DRIVER, NET_ATTACH_TYPE_XDP_OFFLOAD, + NET_ATTACH_TYPE_TCX_INGRESS, + NET_ATTACH_TYPE_TCX_EGRESS, }; static const char * const attach_type_strings[] = { @@ -74,6 +76,8 @@ static const char * const attach_type_strings[] = { [NET_ATTACH_TYPE_XDP_GENERIC] = "xdpgeneric", [NET_ATTACH_TYPE_XDP_DRIVER] = "xdpdrv", [NET_ATTACH_TYPE_XDP_OFFLOAD] = "xdpoffload", + [NET_ATTACH_TYPE_TCX_INGRESS] = "tcx_ingress", + [NET_ATTACH_TYPE_TCX_EGRESS] = "tcx_egress", }; static const char * const attach_loc_strings[] = { @@ -647,6 +651,32 @@ static int do_attach_detach_xdp(int progfd, enum net_attach_type attach_type, return bpf_xdp_attach(ifindex, progfd, flags, NULL); } +static int get_tcx_type(enum net_attach_type attach_type) +{ + switch (attach_type) { + case NET_ATTACH_TYPE_TCX_INGRESS: + return BPF_TCX_INGRESS; + case NET_ATTACH_TYPE_TCX_EGRESS: + return BPF_TCX_EGRESS; + default: + return -1; + } +} + +static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex) +{ + int type = get_tcx_type(attach_type); + + return bpf_prog_attach(progfd, ifindex, type, 0); +} + +static int do_detach_tcx(int targetfd, enum net_attach_type attach_type) +{ + int type = get_tcx_type(attach_type); + + return bpf_prog_detach(targetfd, type); +} + static int do_attach(int argc, char **argv) { enum net_attach_type attach_type; @@ -692,6 +722,11 @@ static int do_attach(int argc, char **argv) case NET_ATTACH_TYPE_XDP_OFFLOAD: err = do_attach_detach_xdp(progfd, attach_type, ifindex, overwrite); break; + /* attach tcx prog */ + case NET_ATTACH_TYPE_TCX_INGRESS: + case NET_ATTACH_TYPE_TCX_EGRESS: + err = do_attach_tcx(progfd, attach_type, ifindex); + break; default: break; } @@ -738,6 +773,11 @@ static int do_detach(int argc, char **argv) progfd = -1; err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL); break; + /* detach tcx prog */ + case NET_ATTACH_TYPE_TCX_INGRESS: + case NET_ATTACH_TYPE_TCX_EGRESS: + err = do_detach_tcx(ifindex, attach_type); + break; default: break; } @@ -944,7 +984,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_PROGRAM "\n" - " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n" + " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload | tcx_ingress\n" + " | tcx_egress }\n" " " HELP_SPEC_OPTIONS " }\n" "\n" "Note: Only xdp, tcx, tc, netkit, flow_dissector and netfilter attachments\n" -- cgit v1.2.3 From 4f88dde0e1525ee69d29ba235c3965685439d0e3 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:38 +0800 Subject: bpftool: Add bash-completion for tcx subcommand This commit adds bash-completion for attaching tcx program on interface. 
Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144238.96246-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/bash-completion/bpftool | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index be99d49b8714..0c541498c301 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1079,7 +1079,7 @@ _bpftool() esac ;; net) - local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' + local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload tcx_ingress tcx_egress' case $command in show|list) [[ $prev != "$command" ]] && return 0 -- cgit v1.2.3 From 0d7c06125cea53b3d86d685d790b03b9ae9d6336 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:52 +0800 Subject: bpftool: Add document for net attach/detach on tcx subcommand This commit adds sample output for net attach/detach on tcx subcommand. Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144252.96264-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/Documentation/bpftool-net.rst | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 348812881297..4a8cb5e0d94b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -29,7 +29,7 @@ NET COMMANDS | **bpftool** **net help** | | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** | **tcx_ingress** | **tcx_egress** } DESCRIPTION =========== @@ -69,6 +69,8 @@ bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; **xdpdrv** - Native XDP. runs earliest point in driver's receive path; **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + **tcx_ingress** - Ingress TCX. runs on ingress net traffic; + **tcx_egress** - Egress TCX. runs on egress net traffic; bpftool net detach *ATTACH_TYPE* dev *NAME* Detach bpf program attached to network interface *NAME* with type specified @@ -178,3 +180,21 @@ EXAMPLES :: xdp: + +| +| **# bpftool net attach tcx_ingress name tc_prog dev lo** +| **# bpftool net** +| + +:: + tc: + lo(1) tcx/ingress tc_prog prog_id 29 + +| +| **# bpftool net attach tcx_ingress name tc_prog dev lo** +| **# bpftool net detach tcx_ingress dev lo** +| **# bpftool net** +| + +:: + tc: -- cgit v1.2.3 From 0af02a8e356fc6d3b1eebb32fae7d35625127835 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:06 +0200 Subject: selftests/timers/posix_timers: Simplify error handling No point in returning to main() on fatal errors. Just exit right away. 
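An illustrative before/after of the resulting pattern (a sketch only; the actual helper and converted call sites are in the diff below):

    /* Before: errors were propagated back to main(), which then exited */
    err = gettimeofday(&start, NULL);
    if (err < 0) {
            ksft_perror("Can't call gettimeofday()");
            return -1;
    }

    /* After: fatal errors terminate the test on the spot */
    if (gettimeofday(&start, NULL) < 0)
            fatal_error(name, "gettimeofday()");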
Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Reviewed-by: Anna-Maria Behnsen --- tools/testing/selftests/timers/posix_timers.c | 151 +++++++++----------------- 1 file changed, 52 insertions(+), 99 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 07c81c0093c0..38db82cf2a3a 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,20 @@ #define DELAY 2 #define USECS_PER_SEC 1000000 +static void __fatal_error(const char *test, const char *name, const char *what) +{ + char buf[64]; + + strerror_r(errno, buf, sizeof(buf)); + + if (name && strlen(name)) + ksft_exit_fail_msg("%s %s %s %s\n", test, name, what, buf); + else + ksft_exit_fail_msg("%s %s %s\n", test, what, buf); +} + +#define fatal_error(name, what) __fatal_error(__func__, name, what) + static volatile int done; /* Busy loop in userspace to elapse ITIMER_VIRTUAL */ @@ -74,24 +89,13 @@ static int check_diff(struct timeval start, struct timeval end) return 0; } -static int check_itimer(int which) +static void check_itimer(int which, const char *name) { - const char *name; - int err; struct timeval start, end; struct itimerval val = { .it_value.tv_sec = DELAY, }; - if (which == ITIMER_VIRTUAL) - name = "ITIMER_VIRTUAL"; - else if (which == ITIMER_PROF) - name = "ITIMER_PROF"; - else if (which == ITIMER_REAL) - name = "ITIMER_REAL"; - else - return -1; - done = 0; if (which == ITIMER_VIRTUAL) @@ -101,17 +105,11 @@ static int check_itimer(int which) else if (which == ITIMER_REAL) signal(SIGALRM, sig_handler); - err = gettimeofday(&start, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&start, NULL) < 0) + fatal_error(name, "gettimeofday()"); - err = setitimer(which, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + if (setitimer(which, &val, NULL) < 0) + fatal_error(name, "setitimer()"); if (which == ITIMER_VIRTUAL) user_loop(); @@ -120,68 +118,41 @@ static int check_itimer(int which) else if (which == ITIMER_REAL) idle_loop(); - err = gettimeofday(&end, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&end, NULL) < 0) + fatal_error(name, "gettimeofday()"); ksft_test_result(check_diff(start, end) == 0, "%s\n", name); - - return 0; } -static int check_timer_create(int which) +static void check_timer_create(int which, const char *name) { - const char *type; - int err; - timer_t id; struct timeval start, end; struct itimerspec val = { .it_value.tv_sec = DELAY, }; - - if (which == CLOCK_THREAD_CPUTIME_ID) { - type = "thread"; - } else if (which == CLOCK_PROCESS_CPUTIME_ID) { - type = "process"; - } else { - ksft_print_msg("Unknown timer_create() type %d\n", which); - return -1; - } + timer_t id; done = 0; - err = timer_create(which, NULL, &id); - if (err < 0) { - ksft_perror("Can't create timer"); - return -1; - } - signal(SIGALRM, sig_handler); - err = gettimeofday(&start, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (timer_create(which, NULL, &id) < 0) + fatal_error(name, "timer_create()"); - err = timer_settime(id, 0, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + if (signal(SIGALRM, sig_handler) == SIG_ERR) + fatal_error(name, "signal()"); + + if 
(gettimeofday(&start, NULL) < 0) + fatal_error(name, "gettimeofday()"); + + if (timer_settime(id, 0, &val, NULL) < 0) + fatal_error(name, "timer_settime()"); user_loop(); - err = gettimeofday(&end, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&end, NULL) < 0) + fatal_error(name, "gettimeofday()"); ksft_test_result(check_diff(start, end) == 0, - "timer_create() per %s\n", type); - - return 0; + "timer_create() per %s\n", name); } static pthread_t ctd_thread; @@ -209,15 +180,14 @@ static void *ctd_thread_func(void *arg) ctd_count = 100; if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id)) - return "Can't create timer\n"; + fatal_error(NULL, "timer_create()"); if (timer_settime(id, 0, &val, NULL)) - return "Can't set timer\n"; - + fatal_error(NULL, "timer_settime()"); while (ctd_count > 0 && !ctd_failed) ; if (timer_delete(id)) - return "Can't delete timer\n"; + fatal_error(NULL, "timer_delete()"); return NULL; } @@ -225,19 +195,16 @@ static void *ctd_thread_func(void *arg) /* * Test that only the running thread receives the timer signal. */ -static int check_timer_distribution(void) +static void check_timer_distribution(void) { - const char *errmsg; - - signal(SIGALRM, ctd_sighandler); + if (signal(SIGALRM, ctd_sighandler) == SIG_ERR) + fatal_error(NULL, "signal()"); - errmsg = "Can't create thread\n"; if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL)) - goto err; + fatal_error(NULL, "pthread_create()"); - errmsg = "Can't join thread\n"; - if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg) - goto err; + if (pthread_join(ctd_thread, NULL)) + fatal_error(NULL, "pthread_join()"); if (!ctd_failed) ksft_test_result_pass("check signal distribution\n"); @@ -245,10 +212,6 @@ static int check_timer_distribution(void) ksft_test_result_fail("check signal distribution\n"); else ksft_test_result_skip("check signal distribution (old kernel)\n"); - return 0; -err: - ksft_print_msg("%s", errmsg); - return -1; } int main(int argc, char **argv) @@ -259,17 +222,10 @@ int main(int argc, char **argv) ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); - if (check_itimer(ITIMER_VIRTUAL) < 0) - ksft_exit_fail(); - - if (check_itimer(ITIMER_PROF) < 0) - ksft_exit_fail(); - - if (check_itimer(ITIMER_REAL) < 0) - ksft_exit_fail(); - - if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0) - ksft_exit_fail(); + check_itimer(ITIMER_VIRTUAL, "ITIMER_VIRTUAL"); + check_itimer(ITIMER_PROF, "ITIMER_PROF"); + check_itimer(ITIMER_REAL, "ITIMER_REAL"); + check_timer_create(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); /* * It's unfortunately hard to reliably test a timer expiration @@ -280,11 +236,8 @@ int main(int argc, char **argv) * to ensure true parallelism. So test only one thread until we * find a better solution. */ - if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) - ksft_exit_fail(); - - if (check_timer_distribution() < 0) - ksft_exit_fail(); + check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_timer_distribution(); ksft_finished(); } -- cgit v1.2.3 From 45c4225c3dcc7db2c0cdbf889cc7a9c72a53f742 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:07 +0200 Subject: selftests/timers/posix_timers: Add SIG_IGN test Add a test case to validate correct behaviour vs. SIG_IGN. 
The posix specification states: "Setting a signal action to SIG_IGN for a signal that is pending shall cause the pending signal to be discarded, whether or not it is blocked." The kernel implements this in the signal handling code, but due to the way posix timers handle SIG_IGN for periodic timers, the behaviour after installing a real handler again is inconsistent and surprising. The following sequence is expected to deliver a signal: install_handler(SIG); block_signal(SIG); timer_create(...); <- Should send SIG timer_settime(value=100ms, interval=100ms); sleep(1); <- Timer expires and queues signal, timer is not rearmed as that should happen in the signal delivery path ignore_signal(SIG); <- Discards queued signal install_handler(SIG); <- Restore handler, should rearm but does not sleep(1); unblock_signal(SIG); <- Should deliver one signal with overrun count set in siginfo This fails because nothing rearms the timer when the signal handler is restored. Add a test for this case, which fails until the signal and posix timer code is fixed. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 127 +++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 38db82cf2a3a..8a6139a1d27a 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -6,8 +6,9 @@ * * Kernel loop code stolen from Steven Rostedt */ - +#define _GNU_SOURCE #include +#include #include #include #include @@ -214,10 +215,129 @@ static void check_timer_distribution(void) ksft_test_result_skip("check signal distribution (old kernel)\n"); } +struct tmrsig { + int signals; + int overruns; +}; + +static void siginfo_handler(int sig, siginfo_t *si, void *uc) +{ + struct tmrsig *tsig = si ?
si->si_ptr : NULL; + + if (tsig) { + tsig->signals++; + tsig->overruns += si->si_overrun; + } +} + +static void *ignore_thread(void *arg) +{ + unsigned int *tid = arg; + sigset_t set; + + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + *tid = gettid(); + sleep(100); + + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + return NULL; +} + +static void check_sig_ign(int thread) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + unsigned int tid = 0; + struct sigaction sa; + struct sigevent sev; + pthread_t pthread; + timer_t timerid; + sigset_t set; + + if (thread) { + if (pthread_create(&pthread, NULL, ignore_thread, &tid)) + fatal_error(NULL, "pthread_create()"); + sleep(1); + } + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (thread) { + sev.sigev_notify = SIGEV_THREAD_ID; + sev._sigev_un._tid = tid; + } + + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + timer_settime(timerid, 0, &its, NULL); + + sleep(1); + + /* Set the signal to be ignored */ + if (signal(SIGUSR1, SIG_IGN) == SIG_ERR) + fatal_error(NULL, "signal(SIG_IGN)"); + + sleep(1); + + if (thread) { + /* Stop the thread first. No signal should be delivered to it */ + if (pthread_cancel(pthread)) + fatal_error(NULL, "pthread_cancel()"); + if (pthread_join(pthread, NULL)) + fatal_error(NULL, "pthread_join()"); + } + + /* Restore the handler */ + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + sleep(1); + + /* Unblock it, which should deliver the signal in the !thread case*/ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + if (!thread) { + ksft_test_result(tsig.signals == 1 && tsig.overruns == 29, + "check_sig_ign SIGEV_SIGNAL\n"); + } else { + ksft_test_result(tsig.signals == 0 && tsig.overruns == 0, + "check_sig_ign SIGEV_THREAD_ID\n"); + } +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(6); + ksft_set_plan(8); ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -239,5 +359,8 @@ int main(int argc, char **argv) check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_timer_distribution(); + check_sig_ign(0); + check_sig_ign(1); + ksft_finished(); } -- cgit v1.2.3 From e65bb03e442751ccf1631382516dcca5b3a20122 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:09 +0200 Subject: selftests/timers/posix_timers: Validate signal rules Add a test case to validate correct behaviour vs. timer reprogramming and deletion. The handling of queued signals in case of timer reprogramming or deletion is inconsistent at best. 
POSIX does not really specify the behaviour for that: - "The effect of disarming or resetting a timer with pending expiration notifications is unspecified." - "The disposition of pending signals for the deleted timer is unspecified." In both cases it is reasonable to expect that pending signals are discarded. Especially in the reprogramming case it does not make sense to account for previous overruns or to deliver a signal for a timer which has been disarmed. Add tests to validate that no unexpected signals are delivered. They fail for now until the signal and posix timer code is updated. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 108 +++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 8a6139a1d27a..41b43d1327eb 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -334,10 +334,114 @@ static void check_sig_ign(int thread) } } +static void check_rearm(void) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + sleep(1); + + /* Reprogram the timer to single shot */ + its.it_value.tv_sec = 10; + its.it_value.tv_nsec = 0; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 0; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + /* Unblock it, which should not deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + ksft_test_result(!tsig.signals, "check_rearm\n"); +} + +static void check_delete(void) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + 
its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + sleep(1); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + /* Unblock it, which should not deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + + ksft_test_result(!tsig.signals, "check_delete\n"); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(8); + ksft_set_plan(10); ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -361,6 +465,8 @@ int main(int argc, char **argv) check_sig_ign(0); check_sig_ign(1); + check_rearm(); + check_delete(); ksft_finished(); } -- cgit v1.2.3 From 2c2b56132bb74b6b801dcb82f57489ae1cf81a91 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:10 +0200 Subject: selftests/timers/posix-timers: Validate SIGEV_NONE Posix timers with a delivery mode of SIGEV_NONE deliver no signals but the remaining expiry time must be readable via timer_gettime() for both one shot and interval timers. That's implemented correctly for regular posix timers but broken for posix CPU timers. Add a self test so the fixes can be verified. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 53 ++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 41b43d1327eb..097a132615f6 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,7 @@ #define DELAY 2 #define USECS_PER_SEC 1000000 +#define NSECS_PER_SEC 1000000000 static void __fatal_error(const char *test, const char *name, const char *what) { @@ -438,10 +440,57 @@ static void check_delete(void) ksft_test_result(!tsig.signals, "check_delete\n"); } +static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2) +{ + int64_t diff; + + diff = NSECS_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec); + diff += ((int) t1.tv_nsec - (int) t2.tv_nsec); + return diff; +} + +static void check_sigev_none(int which, const char *name) +{ + struct timespec start, now; + struct itimerspec its; + struct sigevent sev; + timer_t timerid; + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_NONE; + + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + timer_settime(timerid, 0, &its, NULL); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + if (timer_gettime(timerid, &its)) + fatal_error(name, "timer_gettime()"); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(its.it_value.tv_sec || its.it_value.tv_nsec, + "check_sigev_none %s\n", name); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(10); + ksft_set_plan(12); 
ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -467,6 +516,8 @@ int main(int argc, char **argv) check_sig_ign(1); check_rearm(); check_delete(); + check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); ksft_finished(); } -- cgit v1.2.3 From f924f868ed05d954d79db419e97ba11293110d52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:11 +0200 Subject: selftests/timers/posix-timers: Validate timer_gettime() timer_gettime() must return the correct expiry time for interval timers even when the timer is not armed, which is the case when a signal is pending but blocked. Works correctly for regular posix timers, but not for posix CPU timers. Add a selftest to validate the fixes. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 58 ++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 097a132615f6..4c993dbbdbf2 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -487,10 +487,63 @@ static void check_sigev_none(int which, const char *name) "check_sigev_none %s\n", name); } +static void check_gettime(int which, const char *name) +{ + struct itimerspec its, prev; + struct timespec start, now; + struct sigevent sev; + timer_t timerid; + int wraps = 0; + sigset_t set; + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(name, "timer_settime()"); + + if (timer_gettime(timerid, &prev)) + fatal_error(name, "timer_gettime()"); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + if (timer_gettime(timerid, &its)) + fatal_error(name, "timer_gettime()"); + if (its.it_value.tv_nsec > prev.it_value.tv_nsec) + wraps++; + prev = its; + + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(wraps > 1, "check_gettime %s\n", name); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(12); + ksft_set_plan(15); ksft_print_msg("Testing posix timers. 
False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -518,6 +571,9 @@ int main(int argc, char **argv) check_delete(); check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); ksft_finished(); } -- cgit v1.2.3 From 73339b82f86521eeadbb781d8d7e04813cbd0998 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:12 +0200 Subject: selftests/timers/posix-timers: Validate overrun after unblock When a timer signal is blocked and later unblocked then one signal should be delivered with the correct number of overruns since the timer was queued. Validate that behaviour. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 61 ++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 4c993dbbdbf2..16bd49492efa 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -540,10 +540,66 @@ static void check_gettime(int which, const char *name) ksft_test_result(wraps > 1, "check_gettime %s\n", name); } +static void check_overrun(int which, const char *name) +{ + struct timespec start, now; + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(name, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(name, "timer_settime()"); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + /* Unblock it, which should deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(tsig.signals == 1 && tsig.overruns == 9, + "check_overrun %s\n", name); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(15); + ksft_set_plan(18); ksft_print_msg("Testing posix timers. 
False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -574,6 +630,9 @@ int main(int argc, char **argv) check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); ksft_finished(); } -- cgit v1.2.3 From e44b4fc40cb429f6dedc4518a16221115035f654 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:46 -0700 Subject: selftests/bpf: Fix the missing tramp_1 to tramp_40 ops in cfi_stubs The tramp_1 to tramp_40 ops is not set in the cfi_stubs in the bpf_testmod_ops. It fails the struct_ops_multi_pages test after retiring the unsupported_ops in the earlier patch. This patch initializes them in a loop during the bpf_testmod_init(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index fd28c1157bd3..3687a40b61c6 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -1024,6 +1024,11 @@ static void bpf_testmod_test_2(int a, int b) { } +static int bpf_testmod_tramp(int value) +{ + return 0; +} + static int bpf_testmod_ops__test_maybe_null(int dummy, struct task_struct *task__nullable) { @@ -1080,6 +1085,7 @@ static int bpf_testmod_init(void) .kfunc_btf_id = bpf_testmod_dtor_ids[1] }, }; + void **tramp; int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_testmod_common_kfunc_set); @@ -1103,6 +1109,14 @@ static int bpf_testmod_init(void) ret = register_bpf_testmod_uprobe(); if (ret < 0) return ret; + + /* Ensure nothing is between tramp_1..tramp_40 */ + BUILD_BUG_ON(offsetof(struct bpf_testmod_ops, tramp_1) + 40 * sizeof(long) != + offsetofend(struct bpf_testmod_ops, tramp_40)); + tramp = (void **)&__bpf_testmod_ops.tramp_1; + while (tramp <= (void **)&__bpf_testmod_ops.tramp_40) + *tramp++ = bpf_testmod_tramp; + return 0; } -- cgit v1.2.3 From 4009c95fede6b783802ad01f264a7a0541f5ea60 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:47 -0700 Subject: selftests/bpf: Ensure the unsupported struct_ops prog cannot be loaded There is an existing "bpf_tcp_ca/unsupp_cong_op" test to ensure the unsupported tcp-cc "get_info" struct_ops prog cannot be loaded. This patch adds a new test in the bpf_testmod such that the unsupported ops test does not depend on other kernel subsystem where its supporting ops may be changed in the future. 
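An aside on the tramp_1..tramp_40 initialization in the cfi_stubs patch above: the BUILD_BUG_ON guards the raw pointer walk against padding or member reordering between the first and last tramp slot. Below is a minimal, hedged user-space sketch of the same pattern; demo_ops, demo_tramp and the three-member layout are hypothetical stand-ins for bpf_testmod_ops and its forty tramp slots, offsetofend() is emulated since it is a kernel-only macro, and the sketch assumes LP64, where sizeof(long) equals the size of a function pointer, just as the kernel code does.

#include <stddef.h>
#include <stdio.h>

struct demo_ops {
	int (*tramp_1)(int);
	int (*tramp_2)(int);
	int (*tramp_3)(int);
};

#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

static int demo_tramp(int value) { return 0; }

static struct demo_ops ops;

int main(void)
{
	void **slot;

	/* Ensure nothing is between tramp_1..tramp_3 before walking,
	 * otherwise the loop below would scribble over unrelated members. */
	BUILD_BUG_ON(offsetof(struct demo_ops, tramp_1) + 3 * sizeof(long) !=
		     offsetofend(struct demo_ops, tramp_3));

	for (slot = (void **)&ops.tramp_1; slot <= (void **)&ops.tramp_3; slot++)
		*slot = (void *)demo_tramp;

	printf("tramp_2 stub returns %d\n", ops.tramp_2(0));
	return 0;
}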
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/bpf_testmod/bpf_testmod.h | 1 + .../bpf/prog_tests/test_struct_ops_module.c | 2 ++ .../testing/selftests/bpf/progs/unsupported_ops.c | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/unsupported_ops.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index 23fa1872ee67..fe0d402b0d65 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -35,6 +35,7 @@ struct bpf_testmod_ops { void (*test_2)(int a, int b); /* Used to test nullable arguments. */ int (*test_maybe_null)(int dummy, struct task_struct *task); + int (*unsupported_ops)(void); /* The following fields are used to test shadow copies. */ char onebyte; diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index bbcf12696a6b..75a0dea511b3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -9,6 +9,7 @@ #include "struct_ops_nulled_out_cb.skel.h" #include "struct_ops_forgotten_cb.skel.h" #include "struct_ops_detach.skel.h" +#include "unsupported_ops.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -311,5 +312,6 @@ void serial_test_struct_ops_module(void) test_struct_ops_forgotten_cb(); if (test__start_subtest("test_detach_link")) test_detach_link(); + RUN_TESTS(unsupported_ops); } diff --git a/tools/testing/selftests/bpf/progs/unsupported_ops.c b/tools/testing/selftests/bpf/progs/unsupported_ops.c new file mode 100644 index 000000000000..9180365a3568 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/unsupported_ops.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/unsupported_ops") +__failure +__msg("attach to unsupported member unsupported_ops of struct bpf_testmod_ops") +int BPF_PROG(unsupported_ops) +{ + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops testmod = { + .unsupported_ops = (void *)unsupported_ops, +}; -- cgit v1.2.3 From 4dc7556490d77d5046844e7731d5d0a3055a3448 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:56 +0800 Subject: selftests/bpf: Avoid load failure for token_lsm.c The compiler optimized the two bpf progs in token_lsm.c to make return value from the bool variable in the "return -1" path, causing an unexpected rejection: 0: R1=ctx() R10=fp0 ; int BPF_PROG(bpf_token_capable, struct bpf_token *token, int cap) @ bpf_lsm.c:17 0: (b7) r6 = 0 ; R6_w=0 ; if (my_pid == 0 || my_pid != (bpf_get_current_pid_tgid() >> 32)) @ bpf_lsm.c:19 1: (18) r1 = 0xffffc9000102a000 ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5) 3: (61) r7 = *(u32 *)(r1 +0) ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5) R7_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 4: (15) if r7 == 0x0 goto pc+11 ; R7_w=scalar(smin=umin=umin32=1,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 5: (67) r7 <<= 32 ; R7_w=scalar(smax=0x7fffffff00000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000)) 6: (c7) r7 s>>= 32 ; R7_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff) 7: (85) call bpf_get_current_pid_tgid#14 ; R0=scalar() 8: (77) r0 >>= 32 ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 9: (5d) if r0 != r7 goto pc+6 ; R0_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) R7=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) ; if (reject_capable) @ bpf_lsm.c:21 10: (18) r1 = 0xffffc9000102a004 ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5,off=4) 12: (71) r6 = *(u8 *)(r1 +0) ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5,off=4) R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) ; @ bpf_lsm.c:0 13: (87) r6 = -r6 ; R6_w=scalar() 14: (67) r6 <<= 56 ; R6_w=scalar(smax=0x7f00000000000000,umax=0xff00000000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xff00000000000000)) 15: (c7) r6 s>>= 56 ; R6_w=scalar(smin=smin32=-128,smax=smax32=127) ; int BPF_PROG(bpf_token_capable, struct bpf_token *token, int cap) @ bpf_lsm.c:17 16: (bf) r0 = r6 ; R0_w=scalar(id=1,smin=smin32=-128,smax=smax32=127) R6_w=scalar(id=1,smin=smin32=-128,smax=smax32=127) 17: (95) exit At program exit the register R0 has smin=-128 smax=127 should have been in [-4095, 0] To avoid this failure, change the variable type from bool to int. 
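For readers not fluent in verifier range notation, the failure mode is worth restating: with a 1-byte bool, the compiler may materialize the result through a byte load, negate, and a 56-bit shift pair (insns 12-15 in the log above), so the verifier can only bound R0 to [-128, 127] rather than proving it is exactly -1 or 0. A hedged stand-alone sketch of the two shapes, not the actual token_lsm.c source; compile with clang -O2 -S and compare the generated code:

_Bool reject_b;	/* 1-byte storage: "return -1" may become -(s8)reject_b */
int   reject_i;	/* full-width: the constant -1 stays visible */

int prog_bool(void)
{
	if (reject_b)
		return -1;	/* inferred range may degrade to [-128, 127] */
	return 0;
}

int prog_int(void)
{
	if (reject_i)
		return -1;	/* provably the constant -1 */
	return 0;
}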
Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-7-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/token_lsm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/token_lsm.c b/tools/testing/selftests/bpf/progs/token_lsm.c index e4d59b6ba743..a6002d073b1b 100644 --- a/tools/testing/selftests/bpf/progs/token_lsm.c +++ b/tools/testing/selftests/bpf/progs/token_lsm.c @@ -8,8 +8,8 @@ char _license[] SEC("license") = "GPL"; int my_pid; -bool reject_capable; -bool reject_cmd; +int reject_capable; +int reject_cmd; SEC("lsm/bpf_token_capable") int BPF_PROG(token_capable, struct bpf_token *token, int cap) -- cgit v1.2.3 From 2b23b6c0f03c94dbacff2abdfd45490eca9d7f6e Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:57 +0800 Subject: selftests/bpf: Add return value checks for failed tests The return ranges of some bpf lsm test progs can not be deduced by the verifier accurately. To avoid erroneous rejections, add explicit return value checks for these progs. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-8-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/err.h | 10 ++++++++++ tools/testing/selftests/bpf/progs/test_sig_in_xattr.c | 4 ++++ tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c | 8 ++++++-- tools/testing/selftests/bpf/progs/verifier_global_subprogs.c | 7 ++++++- 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/err.h b/tools/testing/selftests/bpf/progs/err.h index d66d283d9e59..38529779a236 100644 --- a/tools/testing/selftests/bpf/progs/err.h +++ b/tools/testing/selftests/bpf/progs/err.h @@ -5,6 +5,16 @@ #define MAX_ERRNO 4095 #define IS_ERR_VALUE(x) (unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO +#define __STR(x) #x + +#define set_if_not_errno_or_zero(x, y) \ +({ \ + asm volatile ("if %0 s< -4095 goto +1\n" \ + "if %0 s<= 0 goto +1\n" \ + "%0 = " __STR(y) "\n" \ + : "+r"(x)); \ +}) + static inline int IS_ERR_OR_NULL(const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); diff --git a/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c b/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c index 2f0eb1334d65..8ef6b39335b6 100644 --- a/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c +++ b/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c @@ -6,6 +6,7 @@ #include #include #include "bpf_kfuncs.h" +#include "err.h" char _license[] SEC("license") = "GPL"; @@ -79,5 +80,8 @@ int BPF_PROG(test_file_open, struct file *f) ret = bpf_verify_pkcs7_signature(&digest_ptr, &sig_ptr, trusted_keyring); bpf_key_put(trusted_keyring); + + set_if_not_errno_or_zero(ret, -EFAULT); + return ret; } diff --git a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c index f42e9f3831a1..12034a73ee2d 100644 --- a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c +++ b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c @@ -11,6 +11,7 @@ #include #include #include "bpf_kfuncs.h" +#include "err.h" #define MAX_DATA_SIZE (1024 * 1024) #define MAX_SIG_SIZE 1024 @@ -55,12 +56,12 @@ int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) ret = bpf_probe_read_kernel(&value, sizeof(value), &attr->value); if (ret) - return 
ret; + goto out; ret = bpf_copy_from_user(data_val, sizeof(struct data), (void *)(unsigned long)value); if (ret) - return ret; + goto out; if (data_val->data_len > sizeof(data_val->data)) return -EINVAL; @@ -84,5 +85,8 @@ int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) bpf_key_put(trusted_keyring); +out: + set_if_not_errno_or_zero(ret, -EFAULT); + return ret; } diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index a9fc30ed4d73..20904cd2baa2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -7,6 +7,7 @@ #include "bpf_misc.h" #include "xdp_metadata.h" #include "bpf_kfuncs.h" +#include "err.h" /* The compiler may be able to detect the access to uninitialized memory in the routines performing out of bound memory accesses and @@ -331,7 +332,11 @@ SEC("?lsm/bpf") __success __log_level(2) int BPF_PROG(arg_tag_ctx_lsm) { - return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + int ret; + + ret = tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + set_if_not_errno_or_zero(ret, -1); + return ret; } SEC("?struct_ops/test_1") -- cgit v1.2.3 From d463dd9c9aa24b17ccb8ed76bdd7768baf857b48 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:58 +0800 Subject: selftests/bpf: Add test for lsm tail call Add test for lsm tail call to ensure tail call can only be used between bpf lsm progs attached to the same hook. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-9-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/test_lsm.c | 46 ++++++++++++++++++++++- tools/testing/selftests/bpf/progs/lsm_tailcall.c | 34 +++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/lsm_tailcall.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index 16175d579bc7..2a27f3714f5c 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -12,6 +12,7 @@ #include #include "lsm.skel.h" +#include "lsm_tailcall.skel.h" char *CMD_ARGS[] = {"true", NULL}; @@ -95,7 +96,7 @@ static int test_lsm(struct lsm *skel) return 0; } -void test_test_lsm(void) +static void test_lsm_basic(void) { struct lsm *skel = NULL; int err; @@ -114,3 +115,46 @@ void test_test_lsm(void) close_prog: lsm__destroy(skel); } + +static void test_lsm_tailcall(void) +{ + struct lsm_tailcall *skel = NULL; + int map_fd, prog_fd; + int err, key; + + skel = lsm_tailcall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "lsm_tailcall__skel_load")) + goto close_prog; + + map_fd = bpf_map__fd(skel->maps.jmp_table); + if (CHECK_FAIL(map_fd < 0)) + goto close_prog; + + prog_fd = bpf_program__fd(skel->progs.lsm_file_permission_prog); + if (CHECK_FAIL(prog_fd < 0)) + goto close_prog; + + key = 0; + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (CHECK_FAIL(!err)) + goto close_prog; + + prog_fd = bpf_program__fd(skel->progs.lsm_file_alloc_security_prog); + if (CHECK_FAIL(prog_fd < 0)) + goto close_prog; + + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto close_prog; + +close_prog: + lsm_tailcall__destroy(skel); +} + +void test_test_lsm(void) +{ + if 
(test__start_subtest("lsm_basic")) + test_lsm_basic(); + if (test__start_subtest("lsm_tailcall")) + test_lsm_tailcall(); +} diff --git a/tools/testing/selftests/bpf/progs/lsm_tailcall.c b/tools/testing/selftests/bpf/progs/lsm_tailcall.c new file mode 100644 index 000000000000..49c075ce2d4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm_tailcall.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Huawei Technologies Co., Ltd */ + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +SEC("lsm/file_permission") +int lsm_file_permission_prog(void *ctx) +{ + return 0; +} + +SEC("lsm/file_alloc_security") +int lsm_file_alloc_security_prog(void *ctx) +{ + return 0; +} + +SEC("lsm/file_alloc_security") +int lsm_file_alloc_security_entry(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} -- cgit v1.2.3 From 04d8243b1f83251bd599fdd3ea91e89039b6a557 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:59 +0800 Subject: selftests/bpf: Add verifier tests for bpf lsm Add verifier tests to check bpf lsm return values and disabled hooks. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-10-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + tools/testing/selftests/bpf/progs/verifier_lsm.c | 162 ++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_lsm.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 9dc3687bc406..ff1c7da1d06e 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -88,6 +88,7 @@ #include "verifier_xdp.skel.h" #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" +#include "verifier_lsm.skel.h" #define MAX_ENTRIES 11 @@ -206,6 +207,7 @@ void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } +void test_verifier_lsm(void) { RUN(verifier_lsm); } static int init_test_val_map(struct bpf_object *obj, char *map_name) { diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c new file mode 100644 index 000000000000..32e5e779cb96 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("lsm/file_alloc_security") +__description("lsm bpf prog with -4095~0 retval. test 1") +__success +__naked int errno_zero_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_alloc_security") +__description("lsm bpf prog with -4095~0 retval. test 2") +__success +__naked int errno_zero_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = -4095;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval. 
test 4") +__failure __msg("R0 has smin=-4096 smax=-4096 should have been in [-4095, 0]") +__naked int errno_zero_retval_test4(void *ctx) +{ + asm volatile ( + "r0 = -4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval. test 5") +__failure __msg("R0 has smin=4096 smax=4096 should have been in [-4095, 0]") +__naked int errno_zero_retval_test5(void *ctx) +{ + asm volatile ( + "r0 = 4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval. test 6") +__failure __msg("R0 has smin=1 smax=1 should have been in [-4095, 0]") +__naked int errno_zero_retval_test6(void *ctx) +{ + asm volatile ( + "r0 = 1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 1") +__success +__naked int bool_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = 1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 2") +__success +__naked int bool_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 3") +__failure __msg("R0 has smin=-1 smax=-1 should have been in [0, 1]") +__naked int bool_retval_test3(void *ctx) +{ + asm volatile ( + "r0 = -1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 4") +__failure __msg("R0 has smin=2 smax=2 should have been in [0, 1]") +__naked int bool_retval_test4(void *ctx) +{ + asm volatile ( + "r0 = 2;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_free_security") +__success +__description("lsm bpf prog with void retval. test 1") +__naked int void_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = -4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_free_security") +__success +__description("lsm bpf prog with void retval. test 2") +__naked int void_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = 4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/getprocattr") +__description("lsm disabled hook: getprocattr") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test1(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/setprocattr") +__description("lsm disabled hook: setprocattr") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test2(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/ismaclabel") +__description("lsm disabled hook: ismaclabel") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test3(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From aa8ebb270c66cea1f56a25d0f938036e91ad085a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 22 Jul 2024 19:08:15 -0700 Subject: selftests/bpf: Workaround strict bpf_lsm return value check.
test_progs-no_alu32 -t libbpf_get_fd_by_id_opts is being rejected by the verifier with the following error due to compiler optimization: 6: (67) r0 <<= 62 ; R0_w=scalar(smax=0x4000000000000000,umax=0xc000000000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xc000000000000000)) 7: (c7) r0 s>>= 63 ; R0_w=scalar(smin=smin32=-1,smax=smax32=0) ; @ test_libbpf_get_fd_by_id_opts.c:0 8: (57) r0 &= -13 ; R0_w=scalar(smax=0x7ffffffffffffff3,umax=0xfffffffffffffff3,smax32=0x7ffffff3,umax32=0xfffffff3,var_off=(0x0; 0xfffffffffffffff3)) ; int BPF_PROG(check_access, struct bpf_map *map, fmode_t fmode) @ test_libbpf_get_fd_by_id_opts.c:27 9: (95) exit At program exit the register R0 has smax=9223372036854775795 should have been in [-4095, 0] Workaround by adding barrier(). Eventually the verifier will be able to recognize it. Fixes: 5d99e198be27 ("bpf, lsm: Add check for BPF LSM return value") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c b/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c index f5ac5f3e8919..568816307f71 100644 --- a/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c +++ b/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c @@ -31,6 +31,7 @@ int BPF_PROG(check_access, struct bpf_map *map, fmode_t fmode) if (fmode & FMODE_WRITE) return -EACCES; + barrier(); return 0; } -- cgit v1.2.3 From af994e31b75e30941ef32796d0aa8510db00b32a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:03 +0800 Subject: selftests/bpf: Drop make_client in sk_lookup This patch uses the new helper connect_to_addr_str() in sk_lookup.c to create the client socket and connect to the server, instead of using local defined function make_client(). This local function can be dropped then. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/058199d7ab46802249dae066ca22c98f6be508ee.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 34 +++++----------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index ae87c00867ba..9510a3f83787 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -229,27 +229,6 @@ fail: return -1; } -static int make_client(int sotype, const char *ip, int port) -{ - struct sockaddr_storage addr = {0}; - int err, fd; - - fd = make_socket(sotype, ip, port, &addr); - if (fd < 0) - return -1; - - err = connect(fd, (void *)&addr, inetaddr_len(&addr)); - if (CHECK(err, "make_client", "connect")) { - log_err("failed to connect client socket"); - goto fail; - } - - return fd; -fail: - close(fd); - return -1; -} - static __u64 socket_cookie(int fd) { __u64 cookie; @@ -646,8 +625,9 @@ static void run_lookup_prog(const struct test *t) goto close; } - client_fd = make_client(t->sotype, t->connect_to.ip, t->connect_to.port); - if (client_fd < 0) + client_fd = connect_to_addr_str(is_ipv6(t->connect_to.ip) ? 
AF_INET6 : AF_INET, + t->sotype, t->connect_to.ip, t->connect_to.port, NULL); + if (!ASSERT_OK_FD(client_fd, "connect_to_addr_str")) goto close; if (t->sotype == SOCK_STREAM) @@ -1152,8 +1132,8 @@ static void run_sk_assign_connected(struct test_sk_lookup *skel, if (server_fd < 0) return; - connected_fd = make_client(sotype, EXT_IP4, EXT_PORT); - if (connected_fd < 0) + connected_fd = connect_to_addr_str(AF_INET, sotype, EXT_IP4, EXT_PORT, NULL); + if (!ASSERT_OK_FD(connected_fd, "connect_to_addr_str")) goto out_close_server; /* Put a connected socket in redirect map */ @@ -1166,8 +1146,8 @@ static void run_sk_assign_connected(struct test_sk_lookup *skel, goto out_close_connected; /* Try to redirect TCP SYN / UDP packet to a connected socket */ - client_fd = make_client(sotype, EXT_IP4, EXT_PORT); - if (client_fd < 0) + client_fd = connect_to_addr_str(AF_INET, sotype, EXT_IP4, EXT_PORT, NULL); + if (!ASSERT_OK_FD(client_fd, "connect_to_addr_str")) goto out_unlink_prog; if (sotype == SOCK_DGRAM) { send_byte(client_fd); -- cgit v1.2.3 From 01c2f776ed372155f9e80bd787324f26601914de Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:04 +0800 Subject: selftests/bpf: Drop make_socket in sk_lookup This patch uses the public network helers client_socket() + make_sockaddr() in sk_lookup.c to create the client socket, set the timeout sockopts, and make the connecting address. The local defined function make_socket() can be dropped then. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/588771977ac48c27f73526d8421a84b91d7cf218.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 61 ++++++---------------- 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 9510a3f83787..88f2f9efeea5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -47,8 +47,6 @@ #define INT_IP6 "fd00::2" #define INT_PORT 8008 -#define IO_TIMEOUT_SEC 3 - enum server { SERVER_A = 0, SERVER_B = 1, @@ -114,40 +112,6 @@ static socklen_t inetaddr_len(const struct sockaddr_storage *addr) addr->ss_family == AF_INET6 ? sizeof(struct sockaddr_in6) : 0); } -static int make_socket(int sotype, const char *ip, int port, - struct sockaddr_storage *addr) -{ - struct timeval timeo = { .tv_sec = IO_TIMEOUT_SEC }; - int err, family, fd; - - family = is_ipv6(ip) ? AF_INET6 : AF_INET; - err = make_sockaddr(family, ip, port, addr, NULL); - if (CHECK(err, "make_address", "failed\n")) - return -1; - - fd = socket(addr->ss_family, sotype, 0); - if (CHECK(fd < 0, "socket", "failed\n")) { - log_err("failed to make socket"); - return -1; - } - - err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); - if (CHECK(err, "setsockopt(SO_SNDTIMEO)", "failed\n")) { - log_err("failed to set SNDTIMEO"); - close(fd); - return -1; - } - - err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); - if (CHECK(err, "setsockopt(SO_RCVTIMEO)", "failed\n")) { - log_err("failed to set RCVTIMEO"); - close(fd); - return -1; - } - - return fd; -} - static int setsockopts(int fd, void *opts) { struct cb_opts *co = (struct cb_opts *)opts; @@ -842,6 +806,7 @@ static void test_redirect_lookup(struct test_sk_lookup *skel) static void drop_on_lookup(const struct test *t) { + int family = is_ipv6(t->connect_to.ip) ? 
AF_INET6 : AF_INET; struct sockaddr_storage dst = {}; int client_fd, server_fd, err; struct bpf_link *lookup_link; @@ -856,11 +821,13 @@ static void drop_on_lookup(const struct test *t) if (server_fd < 0) goto detach; - client_fd = make_socket(t->sotype, t->connect_to.ip, - t->connect_to.port, &dst); - if (client_fd < 0) + client_fd = client_socket(family, t->sotype, NULL); + if (!ASSERT_OK_FD(client_fd, "client_socket")) goto close_srv; + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto close_all; err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); if (t->sotype == SOCK_DGRAM) { err = send_byte(client_fd); @@ -956,6 +923,7 @@ static void test_drop_on_lookup(struct test_sk_lookup *skel) static void drop_on_reuseport(const struct test *t) { + int family = is_ipv6(t->connect_to.ip) ? AF_INET6 : AF_INET; struct sockaddr_storage dst = { 0 }; int client, server1, server2, err; struct bpf_link *lookup_link; @@ -980,11 +948,13 @@ static void drop_on_reuseport(const struct test *t) if (server2 < 0) goto close_srv1; - client = make_socket(t->sotype, t->connect_to.ip, - t->connect_to.port, &dst); - if (client < 0) + client = client_socket(family, t->sotype, NULL); + if (!ASSERT_OK_FD(client, "client_socket")) goto close_srv2; + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto close_all; err = connect(client, (void *)&dst, inetaddr_len(&dst)); if (t->sotype == SOCK_DGRAM) { err = send_byte(client); @@ -1228,10 +1198,13 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) if (err) goto out_close_server; - client_fd = make_socket(SOCK_STREAM, EXT_IP4, EXT_PORT, &dst); - if (client_fd < 0) + client_fd = client_socket(AF_INET, SOCK_STREAM, NULL); + if (!ASSERT_OK_FD(client_fd, "client_socket")) goto out_close_server; + err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto out_close_client; err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); if (CHECK(err && !t->expect_errno, "connect", "unexpected error %d\n", errno)) -- cgit v1.2.3 From c3c41e016cca7cc6b6d9cd1322fdf9be845ccd06 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:05 +0800 Subject: selftests/bpf: Drop inetaddr_len in sk_lookup No need to use a dedicated helper inetaddr_len() to get the length of the IPv4 or IPv6 address, it can be got by make_sockaddr(), this patch drops it. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/32e2a4122921051da38a6e4fbb2ebee5f0af5a4e.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 88f2f9efeea5..96f3863da8bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -106,12 +106,6 @@ static int attach_reuseport(int sock_fd, struct bpf_program *reuseport_prog) return 0; } -static socklen_t inetaddr_len(const struct sockaddr_storage *addr) -{ - return (addr->ss_family == AF_INET ? sizeof(struct sockaddr_in) : - addr->ss_family == AF_INET6 ? 
sizeof(struct sockaddr_in6) : 0); -} - static int setsockopts(int fd, void *opts) { struct cb_opts *co = (struct cb_opts *)opts; @@ -810,6 +804,7 @@ static void drop_on_lookup(const struct test *t) struct sockaddr_storage dst = {}; int client_fd, server_fd, err; struct bpf_link *lookup_link; + socklen_t len; ssize_t n; lookup_link = attach_lookup_prog(t->lookup_prog); @@ -825,10 +820,10 @@ static void drop_on_lookup(const struct test *t) if (!ASSERT_OK_FD(client_fd, "client_socket")) goto close_srv; - err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto close_all; - err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); + err = connect(client_fd, (void *)&dst, len); if (t->sotype == SOCK_DGRAM) { err = send_byte(client_fd); if (err) @@ -927,6 +922,7 @@ static void drop_on_reuseport(const struct test *t) struct sockaddr_storage dst = { 0 }; int client, server1, server2, err; struct bpf_link *lookup_link; + socklen_t len; ssize_t n; lookup_link = attach_lookup_prog(t->lookup_prog); @@ -952,10 +948,10 @@ static void drop_on_reuseport(const struct test *t) if (!ASSERT_OK_FD(client, "client_socket")) goto close_srv2; - err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto close_all; - err = connect(client, (void *)&dst, inetaddr_len(&dst)); + err = connect(client, (void *)&dst, len); if (t->sotype == SOCK_DGRAM) { err = send_byte(client); if (err) @@ -1169,6 +1165,7 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) int map_fd, server_fd, client_fd; struct bpf_link *link1, *link2; int prog_idx, done, err; + socklen_t len; map_fd = bpf_map__fd(t->run_map); @@ -1202,10 +1199,10 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) if (!ASSERT_OK_FD(client_fd, "client_socket")) goto out_close_server; - err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, NULL); + err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto out_close_client; - err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); + err = connect(client_fd, (void *)&dst, len); if (CHECK(err && !t->expect_errno, "connect", "unexpected error %d\n", errno)) goto out_close_client; -- cgit v1.2.3 From 71a2fbaf9c91b0d863e11ec03f7168893904fbab Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:06 +0800 Subject: selftests/bpf: Drop __start_server in network_helpers The helper start_server_addr() is a wrapper of __start_server(), the only difference between them is __start_server() accepts a sockaddr type address parameter, but start_server_addr() accepts a sockaddr_storage one. This patch drops __start_server(), and updates the callers to invoke start_server_addr() instead. 
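A hedged usage sketch of the surviving entry points after this consolidation (the address and port are made up; passing NULL opts now falls back to default_opts inside the helper, as the new guard shows):

#include <sys/socket.h>
#include "network_helpers.h"

static int demo_start_tcp_server(void)
{
	struct sockaddr_storage addr;
	socklen_t addrlen;

	if (make_sockaddr(AF_INET, "127.0.0.1", 8080, &addr, &addrlen))
		return -1;

	/* NULL opts => default_opts, handled inside start_server_addr() */
	return start_server_addr(SOCK_STREAM, &addr, addrlen, NULL);
}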
Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/31399df7cb957b7c233e79963b0aa0dc4278d273.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 9c98a60cf1e2..a3f0a49fb26f 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -80,12 +80,15 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, - const struct network_helper_opts *opts) +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { int fd; - fd = socket(addr->sa_family, type, opts->proto); + if (!opts) + opts = &default_opts; + + fd = socket(addr->ss_family, type, opts->proto); if (fd < 0) { log_err("Failed to create server socket"); return -1; @@ -100,7 +103,7 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl goto error_close; } - if (bind(fd, addr, addrlen) < 0) { + if (bind(fd, (struct sockaddr *)addr, addrlen) < 0) { log_err("Failed to bind socket"); goto error_close; } @@ -131,7 +134,7 @@ int start_server_str(int family, int type, const char *addr_str, __u16 port, if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, (struct sockaddr *)&addr, addrlen, opts); + return start_server_addr(type, &addr, addrlen, opts); } int start_server(int family, int type, const char *addr_str, __u16 port, @@ -173,7 +176,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); + fds[0] = start_server_addr(type, &addr, addrlen, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -182,7 +185,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); + fds[nr_fds] = start_server_addr(type, &addr, addrlen, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -194,15 +197,6 @@ close_fds: return NULL; } -int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, - const struct network_helper_opts *opts) -{ - if (!opts) - opts = &default_opts; - - return __start_server(type, (struct sockaddr *)addr, len, opts); -} - void free_fds(int *fds, unsigned int nr_close_fds) { if (fds) { -- cgit v1.2.3 From c7db4873fbd63918c124a5d67182492c78ba705b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 22 Jul 2024 22:14:55 -0700 Subject: selftests/bpf: Add a test for mmap-able map in map Regular BPF hash map is not mmap-able from user space. However, map-in-map with outer map of type BPF_MAP_TYPE_HASH_OF_MAPS and mmap-able array as inner map can perform similar operations as a mmap-able hash map. This can be used by applications that benefit from fast accesses to some local data. Add a selftest to show this use case. 
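Distilled to its core, and hedged as a sketch rather than the exact selftest that follows (error handling omitted): user space mmap()s the inner array's fd, then publishes that same fd in the HASH_OF_MAPS outer map so BPF programs can reach it; a BPF-side store through the inner array is then immediately visible through the mapped page, with no syscall on the read path.

#include <sys/mman.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include "mmap_inner_array.skel.h"

static __u64 *demo_publish_inner(struct mmap_inner_array *skel, long page_size)
{
	int inner_fd = bpf_map__fd(skel->maps.inner_array);
	__u64 *val = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			  MAP_SHARED, inner_fd, 0);
	__u32 key = getpid();

	/* make the mmap-able inner array reachable from BPF under this key */
	bpf_map_update_elem(bpf_map__fd(skel->maps.outer_map),
			    &key, &inner_fd, BPF_ANY);
	return val;	/* BPF-side stores via the inner map appear here */
}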
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240723051455.1589192-1-song@kernel.org Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- .../bpf/prog_tests/test_mmap_inner_array.c | 57 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/mmap_inner_array.c | 57 ++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c create mode 100644 tools/testing/selftests/bpf/progs/mmap_inner_array.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c b/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c new file mode 100644 index 000000000000..ce745776ed18 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "mmap_inner_array.skel.h" + +void test_mmap_inner_array(void) +{ + const long page_size = sysconf(_SC_PAGE_SIZE); + struct mmap_inner_array *skel; + int inner_array_fd, err; + void *tmp; + __u64 *val; + + skel = mmap_inner_array__open_and_load(); + + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + inner_array_fd = bpf_map__fd(skel->maps.inner_array); + tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, inner_array_fd, 0); + if (!ASSERT_OK_PTR(tmp, "inner array mmap")) + goto out; + val = (void *)tmp; + + err = mmap_inner_array__attach(skel); + if (!ASSERT_OK(err, "attach")) + goto out_unmap; + + skel->bss->pid = getpid(); + usleep(1); + + /* pid is set, pid_match == true and outer_map_match == false */ + ASSERT_TRUE(skel->bss->pid_match, "pid match 1"); + ASSERT_FALSE(skel->bss->outer_map_match, "outer map match 1"); + ASSERT_FALSE(skel->bss->done, "done 1"); + ASSERT_EQ(*val, 0, "value match 1"); + + err = bpf_map__update_elem(skel->maps.outer_map, + &skel->bss->pid, sizeof(skel->bss->pid), + &inner_array_fd, sizeof(inner_array_fd), + BPF_ANY); + if (!ASSERT_OK(err, "update elem")) + goto out_unmap; + usleep(1); + + /* outer map key is set, outer_map_match == true */ + ASSERT_TRUE(skel->bss->pid_match, "pid match 2"); + ASSERT_TRUE(skel->bss->outer_map_match, "outer map match 2"); + ASSERT_TRUE(skel->bss->done, "done 2"); + ASSERT_EQ(*val, skel->data->match_value, "value match 2"); + +out_unmap: + munmap(tmp, page_size); +out: + mmap_inner_array__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/mmap_inner_array.c b/tools/testing/selftests/bpf/progs/mmap_inner_array.c new file mode 100644 index 000000000000..90aacbc2938a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mmap_inner_array.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include "vmlinux.h" +#include + +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct inner_array_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 1); +} inner_array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); + __array(values, struct inner_array_type); +} outer_map SEC(".maps"); + +int pid = 0; +__u64 match_value = 0x13572468; +bool done = false; +bool pid_match = false; +bool outer_map_match = false; + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int add_to_list_in_inner_array(void *ctx) +{ + __u32 curr_pid, zero = 0; + struct bpf_map *map; + __u64 *value; + + curr_pid = (u32)bpf_get_current_pid_tgid(); + if (done || curr_pid != pid) + return 0; + + pid_match = true; + map = bpf_map_lookup_elem(&outer_map, &curr_pid); + if (!map) + return 0; + + outer_map_match = true; + value = bpf_map_lookup_elem(map, &zero); + if (!value) + return 0; + + *value = match_value; + done = true; + return 0; +} -- cgit v1.2.3 From a0ef659d03d2fc66c270cee791a1c5ebc0ff3bac Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 23 Jul 2024 03:07:00 +0000 Subject: selftests/bpf: Don't include .d files on make clean Ignore generated %.test.o dependencies when make goal is clean or docs-clean. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/all/oNTIdax7aWGJdEgabzTqHzF4r-WTERrV1e1cNaPQMp-UhYUQpozXqkbuAlLBulczr6I99-jM5x3dxv56JJowaYBkm765R9Aa9kyrVuCl_kA=@pm.me Link: https://lore.kernel.org/bpf/K69Y8OKMLXBWR0dtOfsC4J46-HxeQfvqoFx1CysCm7u19HRx4MB6yAKOFkM6X-KAx2EFuCcCh_9vYWpsgQXnAer8oQ8PMeDEuiRMYECuGH4=@pm.me --- tools/testing/selftests/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 05b234248b38..74f829952280 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -608,7 +608,9 @@ $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) +ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) +endif $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ -- cgit v1.2.3 From c7ad90736763b4e6bed10788ec07020a4f15dc32 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 17:13:28 -0700 Subject: selftests/bpf: Add missing system defines for mips Update get_sys_includes in Makefile with missing MIPS-related definitions to fix many, many compilation errors building selftests/bpf. 
The following added defines drive conditional logic in system headers for word-size and endianness selection: MIPSEL, MIPSEB _MIPS_SZPTR _MIPS_SZLONG _MIPS_SIM, _ABIO32, _ABIN32, _ABI64 Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/f3cfceaf5299cdd2ac0e0a36072d6ca7be23e603.1721692479.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 74f829952280..a7c797dd075f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -396,7 +396,8 @@ define get_sys_includes $(shell $(1) $(2) -v -E - &1 \ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ $(shell $(1) $(2) -dM -E - Date: Mon, 22 Jul 2024 17:13:29 -0700 Subject: selftests/bpf: Fix error linking uprobe_multi on mips Linking uprobe_multi.c on mips64el fails due to relocation overflows, when the GOT entries required exceeds the default maximum. Add a specific CFLAGS (-mxgot) for uprobe_multi.c on MIPS that allows using a larger GOT and avoids errors such as: /tmp/ccBTNQzv.o: in function `bench': uprobe_multi.c:49:(.text+0x1d7720): relocation truncated to fit: R_MIPS_GOT_DISP against `uprobe_multi_func_08188' uprobe_multi.c:49:(.text+0x1d7730): relocation truncated to fit: R_MIPS_GOT_DISP against `uprobe_multi_func_08189' ... collect2: error: ld returned 1 exit status Fixes: 519dfeaf5119 ("selftests/bpf: Add uprobe_multi test program") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/14eb7b70f8ccef9834874d75eb373cb9292129da.1721692479.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a7c797dd075f..8e01e4f9b1a7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -786,6 +786,8 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ +# Linking uprobe_multi can fail due to relocation overflows on mips. +$(OUTPUT)/uprobe_multi: CFLAGS += $(if $(filter mips, $(ARCH)),-mxgot) $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ -- cgit v1.2.3 From d17f9b370df628a3faaec7fd8be92a89ecc3e33a Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Tue, 23 Jul 2024 09:10:31 +0200 Subject: selftests/bpf: Fix compilation failure when CONFIG_NET_FOU!=y Without CONFIG_NET_FOU bpf selftests are unable to build because of missing definitions. Add ___local versions of struct bpf_fou_encap and enum bpf_fou_encap_type to fix the issue. 
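Some context for the ___local suffix used in the fix: libbpf treats everything from a triple underscore onward in a type name as a CO-RE "flavor" and strips it when matching the type against kernel BTF, so a BPF program can carry its own mirror of a kernel type that may be missing from its build-time headers. A minimal hedged sketch of the convention, using a hypothetical kernel struct and assuming the usual vmlinux.h compilation context:

/* Local mirror of a (hypothetical) kernel "struct kern_thing"; the
 * ___local flavor suffix is ignored during CO-RE matching, and
 * preserve_access_index relocates field offsets against the running
 * kernel's BTF at load time. */
struct kern_thing___local {
	__u32 field_a;
	__u32 field_b;
} __attribute__((preserve_access_index));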
Signed-off-by: Artem Savkov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240723071031.3389423-1-asavkov@redhat.com --- .../testing/selftests/bpf/progs/test_tunnel_kern.c | 27 +++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 3f5abcf3ff13..32127f1cd687 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -26,6 +26,18 @@ */ #define ASSIGNED_ADDR_VETH1 0xac1001c8 +struct bpf_fou_encap___local { + __be16 sport; + __be16 dport; +} __attribute__((preserve_access_index)); + +enum bpf_fou_encap_type___local { + FOU_BPF_ENCAP_FOU___local, + FOU_BPF_ENCAP_GUE___local, +}; + +struct bpf_fou_encap; + int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, struct bpf_fou_encap *encap, int type) __ksym; int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, @@ -745,7 +757,7 @@ SEC("tc") int ipip_gue_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; void *data_end = (void *)(long)skb->data_end; @@ -769,7 +781,9 @@ int ipip_gue_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, &encap, FOU_BPF_ENCAP_GUE); + ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + bpf_core_enum_value(enum bpf_fou_encap_type___local, + FOU_BPF_ENCAP_GUE___local)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -782,7 +796,7 @@ SEC("tc") int ipip_fou_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; void *data_end = (void *)(long)skb->data_end; @@ -806,7 +820,8 @@ int ipip_fou_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, &encap, FOU_BPF_ENCAP_FOU); + ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + FOU_BPF_ENCAP_FOU___local); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -820,7 +835,7 @@ int ipip_encap_get_tunnel(struct __sk_buff *skb) { int ret; struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); if (ret < 0) { @@ -828,7 +843,7 @@ int ipip_encap_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_get_fou_encap(skb, &encap); + ret = bpf_skb_get_fou_encap(skb, (struct bpf_fou_encap *)&encap); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; -- cgit v1.2.3 From 30c2980c1092d5d69b2a3f460491145ebe0629ec Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 22 Jul 2024 22:27:57 +0200 Subject: selftests/bpf: Add uprobe fail tests for uprobe multi Adding tests for checking on recovery after failing to attach uprobe. 
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240722202758.3889061-2-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 118 +++++++++++++++++++++ 1 file changed, 118 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index bf6ca8e3eb13..e6255d4df81d 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -516,6 +516,122 @@ cleanup: uprobe_multi__destroy(skel); } +#ifdef __x86_64__ +noinline void uprobe_multi_error_func(void) +{ + /* + * If --fcf-protection=branch is enabled the gcc generates endbr as + * first instruction, so marking the exact address of int3 with the + * symbol to be used in the attach_uprobe_fail_trap test below. + */ + asm volatile ( + ".globl uprobe_multi_error_func_int3; \n" + "uprobe_multi_error_func_int3: \n" + "int3 \n" + ); +} + +/* + * Attaching uprobe on uprobe_multi_error_func results in error + * because it already starts with int3 instruction. + */ +static void attach_uprobe_fail_trap(struct uprobe_multi *skel) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + const char *syms[4] = { + "uprobe_multi_func_1", + "uprobe_multi_func_2", + "uprobe_multi_func_3", + "uprobe_multi_error_func_int3", + }; + + opts.syms = syms; + opts.cnt = ARRAY_SIZE(syms); + + skel->links.uprobe = bpf_program__attach_uprobe_multi(skel->progs.uprobe, -1, + "/proc/self/exe", NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.uprobe, "bpf_program__attach_uprobe_multi")) { + bpf_link__destroy(skel->links.uprobe); + skel->links.uprobe = NULL; + } +} +#else +static void attach_uprobe_fail_trap(struct uprobe_multi *skel) { } +#endif + +short sema_1 __used, sema_2 __used; + +static void attach_uprobe_fail_refctr(struct uprobe_multi *skel) +{ + unsigned long *tmp_offsets = NULL, *tmp_ref_ctr_offsets = NULL; + unsigned long offsets[3], ref_ctr_offsets[3]; + LIBBPF_OPTS(bpf_link_create_opts, opts); + const char *path = "/proc/self/exe"; + const char *syms[3] = { + "uprobe_multi_func_1", + "uprobe_multi_func_2", + }; + const char *sema[3] = { + "sema_1", + "sema_2", + }; + int prog_fd, link_fd, err; + + prog_fd = bpf_program__fd(skel->progs.uprobe_extra); + + err = elf_resolve_syms_offsets("/proc/self/exe", 2, (const char **) &syms, + &tmp_offsets, STT_FUNC); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_func")) + return; + + err = elf_resolve_syms_offsets("/proc/self/exe", 2, (const char **) &sema, + &tmp_ref_ctr_offsets, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + goto cleanup; + + /* + * We attach to 3 uprobes on 2 functions, so 2 uprobes share single function, + * but with different ref_ctr_offset which is not allowed and results in fail. 
+ */ + offsets[0] = tmp_offsets[0]; /* uprobe_multi_func_1 */ + offsets[1] = tmp_offsets[1]; /* uprobe_multi_func_2 */ + offsets[2] = tmp_offsets[1]; /* uprobe_multi_func_2 */ + + ref_ctr_offsets[0] = tmp_ref_ctr_offsets[0]; /* sema_1 */ + ref_ctr_offsets[1] = tmp_ref_ctr_offsets[1]; /* sema_2 */ + ref_ctr_offsets[2] = tmp_ref_ctr_offsets[0]; /* sema_1, error */ + + opts.uprobe_multi.path = path; + opts.uprobe_multi.offsets = (const unsigned long *) &offsets; + opts.uprobe_multi.ref_ctr_offsets = (const unsigned long *) &ref_ctr_offsets; + opts.uprobe_multi.cnt = 3; + + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_ERR(link_fd, "link_fd")) + close(link_fd); + +cleanup: + free(tmp_ref_ctr_offsets); + free(tmp_offsets); +} + +static void test_attach_uprobe_fails(void) +{ + struct uprobe_multi *skel = NULL; + + skel = uprobe_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi__open_and_load")) + return; + + /* attach fails due to adding uprobe on trap instruction, x86_64 only */ + attach_uprobe_fail_trap(skel); + + /* attach fail due to wrong ref_ctr_offs on one of the uprobes */ + attach_uprobe_fail_refctr(skel); + + uprobe_multi__destroy(skel); +} + static void __test_link_api(struct child *child) { int prog_fd, link1_fd = -1, link2_fd = -1, link3_fd = -1, link4_fd = -1; @@ -703,4 +819,6 @@ void test_uprobe_multi_test(void) test_bench_attach_usdt(); if (test__start_subtest("attach_api_fails")) test_attach_api_fails(); + if (test__start_subtest("attach_uprobe_fails")) + test_attach_uprobe_fails(); } -- cgit v1.2.3 From 98adc743ae1febb401d635bebe46667af2692751 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 22 Jul 2024 22:27:58 +0200 Subject: selftests/bpf: Add uprobe multi consumers test Adding test that attaches/detaches multiple consumers on single uprobe and verifies all were hit as expected. 
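The pass/fail oracle for these combinations is subtle enough to be worth distilling up front; here is a hedged restatement of the checks performed in consumer_test() below, folded into one hypothetical helper (test_bit() is the helper defined in the test itself):

/* expected hit count for consumer idx, given before/after attach masks */
static __u64 expected_hits(int idx, unsigned long before, unsigned long after)
{
	if (idx < 2)			/* uprobe entry consumers */
		return test_bit(idx, before) ? 1 : 0;

	/* uretprobe consumers fire only if a return uprobe was installed
	 * when the function was entered AND the kernel uprobe object
	 * survived the detaches done inside the probed function. */
	return ((before & 0b1100) && (before & after) &&
		test_bit(idx, after)) ? 1 : 0;
}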
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240722202758.3889061-3-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 213 +++++++++++++++++++++ .../selftests/bpf/progs/uprobe_multi_consumers.c | 39 ++++ 2 files changed, 252 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index e6255d4df81d..acb62675ff65 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -6,6 +6,7 @@ #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" #include "uprobe_multi_usdt.skel.h" +#include "uprobe_multi_consumers.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" #include "../sdt.h" @@ -731,6 +732,216 @@ static void test_link_api(void) __test_link_api(child); } +static struct bpf_program * +get_program(struct uprobe_multi_consumers *skel, int prog) +{ + switch (prog) { + case 0: + return skel->progs.uprobe_0; + case 1: + return skel->progs.uprobe_1; + case 2: + return skel->progs.uprobe_2; + case 3: + return skel->progs.uprobe_3; + default: + ASSERT_FAIL("get_program"); + return NULL; + } +} + +static struct bpf_link ** +get_link(struct uprobe_multi_consumers *skel, int link) +{ + switch (link) { + case 0: + return &skel->links.uprobe_0; + case 1: + return &skel->links.uprobe_1; + case 2: + return &skel->links.uprobe_2; + case 3: + return &skel->links.uprobe_3; + default: + ASSERT_FAIL("get_link"); + return NULL; + } +} + +static int uprobe_attach(struct uprobe_multi_consumers *skel, int idx) +{ + struct bpf_program *prog = get_program(skel, idx); + struct bpf_link **link = get_link(skel, idx); + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + + if (!prog || !link) + return -1; + + /* + * bit/prog: 0,1 uprobe entry + * bit/prog: 2,3 uprobe return + */ + opts.retprobe = idx == 2 || idx == 3; + + *link = bpf_program__attach_uprobe_multi(prog, 0, "/proc/self/exe", + "uprobe_consumer_test", + &opts); + if (!ASSERT_OK_PTR(*link, "bpf_program__attach_uprobe_multi")) + return -1; + return 0; +} + +static void uprobe_detach(struct uprobe_multi_consumers *skel, int idx) +{ + struct bpf_link **link = get_link(skel, idx); + + bpf_link__destroy(*link); + *link = NULL; +} + +static bool test_bit(int bit, unsigned long val) +{ + return val & (1 << bit); +} + +noinline int +uprobe_consumer_test(struct uprobe_multi_consumers *skel, + unsigned long before, unsigned long after) +{ + int idx; + + /* detach uprobe for each unset programs in 'before' state ... */ + for (idx = 0; idx < 4; idx++) { + if (test_bit(idx, before) && !test_bit(idx, after)) + uprobe_detach(skel, idx); + } + + /* ... 
and attach the uprobe for each program newly set in 'after' */ + for (idx = 0; idx < 4; idx++) { + if (!test_bit(idx, before) && test_bit(idx, after)) { + if (!ASSERT_OK(uprobe_attach(skel, idx), "uprobe_attach_after")) + return -1; + } + } + return 0; +} + +static void consumer_test(struct uprobe_multi_consumers *skel, + unsigned long before, unsigned long after) +{ + int err, idx; + + printf("consumer_test before %lu after %lu\n", before, after); + + /* attach a uprobe for each idx set in the 'before' state */ + for (idx = 0; idx < 4; idx++) { + if (test_bit(idx, before)) { + if (!ASSERT_OK(uprobe_attach(skel, idx), "uprobe_attach_before")) + goto cleanup; + } + } + + err = uprobe_consumer_test(skel, before, after); + if (!ASSERT_EQ(err, 0, "uprobe_consumer_test")) + goto cleanup; + + for (idx = 0; idx < 4; idx++) { + const char *fmt = "BUG"; + __u64 val = 0; + + if (idx < 2) { + /* + * uprobe entry + * +1 if defined in 'before' + */ + if (test_bit(idx, before)) + val++; + fmt = "prog 0/1: uprobe"; + } else { + /* + * uprobe return is tricky ;-) + * + * to trigger the uretprobe consumer, the uretprobe needs to be installed, + * which means one of the 'return' uprobes was alive when the probe was hit: + * + * idxs: 2/3 uprobe return in 'installed' mask + * + * in addition, if the 'after' state removes everything that was installed in + * the 'before' state, the kernel uprobe object goes away, the return uprobe + * is no longer installed, and we won't hit it even if it's in the 'after' state. + */ + unsigned long had_uretprobes = before & 0b1100; /* is uretprobe installed */ + unsigned long probe_preserved = before & after; /* did the uprobe stay installed */ + + if (had_uretprobes && probe_preserved && test_bit(idx, after)) + val++; + fmt = "idx 2/3: uretprobe"; + } + + ASSERT_EQ(skel->bss->uprobe_result[idx], val, fmt); + skel->bss->uprobe_result[idx] = 0; + } + +cleanup: + for (idx = 0; idx < 4; idx++) + uprobe_detach(skel, idx); +} + +static void test_consumers(void) +{ + struct uprobe_multi_consumers *skel; + int before, after; + + skel = uprobe_multi_consumers__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi_consumers__open_and_load")) + return; + + /* + * The idea of this test is to try all possible combinations of + * uprobe consumers attached to a single function. + * + * - 2 uprobe entry consumers + * - 2 uprobe return consumers + * + * The test uses 4 uprobes attached to a single function, but that + * translates into a single uprobe with 4 consumers in the kernel. + * + * The before/after values represent the set of attached consumers + * before and after the probed function is called: + * + * bit/prog 0,1 : uprobe entry + * bit/prog 2,3 : uprobe return + * + * For example for: + * + * before = 0b0101 + * after = 0b0110 + * + * it means that before calling 'uprobe_consumer_test' we attach + * the uprobes set in the 'before' value: + * + * - bit/prog 0: uprobe entry + * - bit/prog 2: uprobe return + * + * uprobe_consumer_test is called and inside it we attach and detach + * uprobes based on the 'after' value: + * + * - bit/prog 0: stays untouched + * - bit/prog 2: uprobe return is detached + * + * uprobe_consumer_test returns and we check that the counter values + * incremented by the bpf programs on each uprobe match the expected + * counts based on the before/after bits.
+ */ + + for (before = 0; before < 16; before++) { + for (after = 0; after < 16; after++) + consumer_test(skel, before, after); + } + + uprobe_multi_consumers__destroy(skel); +} + static void test_bench_attach_uprobe(void) { long attach_start_ns = 0, attach_end_ns = 0, @@ -821,4 +1032,6 @@ void test_uprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_uprobe_fails")) test_attach_uprobe_fails(); + if (test__start_subtest("consumers")) + test_consumers(); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c b/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c new file mode 100644 index 000000000000..7e0fdcbbd242 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <stdbool.h> +#include "bpf_kfuncs.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +__u64 uprobe_result[4]; + +SEC("uprobe.multi") +int uprobe_0(struct pt_regs *ctx) +{ + uprobe_result[0]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_1(struct pt_regs *ctx) +{ + uprobe_result[1]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_2(struct pt_regs *ctx) +{ + uprobe_result[2]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_3(struct pt_regs *ctx) +{ + uprobe_result[3]++; + return 0; +} -- cgit v1.2.3 From 3ece93a4087b2db7b99ebb2412bd60cf26bbbb51 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 19 Jul 2024 22:25:35 -0700 Subject: selftests/bpf: Fix wrong binary in Makefile log output The make log output incorrectly shows 'test_maps' as the binary name for every 'CLNG-BPF' build step, apparently picking up the last value defined for the $(TRUNNER_BINARY) variable. Update the 'CLANG_BPF_BUILD_RULE' variants to fix this confusing output. Current output: CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs] access_map_in_map.skel.h ... CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs-no_alu32] access_map_in_map.skel.h ... CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs-cpuv4] access_map_in_map.skel.h After fix: CLNG-BPF [test_progs] access_map_in_map.bpf.o GEN-SKEL [test_progs] access_map_in_map.skel.h ... CLNG-BPF [test_progs-no_alu32] access_map_in_map.bpf.o GEN-SKEL [test_progs-no_alu32] access_map_in_map.skel.h ... 
CLNG-BPF [test_progs-cpuv4] access_map_in_map.bpf.o GEN-SKEL [test_progs-cpuv4] access_map_in_map.skel.h Fixes: a5d0c26a2784 ("selftests/bpf: Add a cpuv4 test runner for cpu=v4 testing") Fixes: 89ad7420b25c ("selftests/bpf: Drop the need for LLVM's llc") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240720052535.2185967-1-tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8e01e4f9b1a7..3318d5425d75 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -428,23 +428,24 @@ $(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h # $1 - input .c file # $2 - output .o file # $3 - CFLAGS +# $4 - binary name define CLANG_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v3 -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v2 -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with cpu-v4 define CLANG_CPUV4_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v4 -o $2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE - $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) + $(call msg,GCC-BPF,$4,$2) $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 endef @@ -537,7 +538,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ $$($$<-CFLAGS) \ - $$($$<-$2-CFLAGS)) + $$($$<-$2-CFLAGS),$(TRUNNER_BINARY)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) -- cgit v1.2.3 From f86601c3661946721e8f260bdd812b759854ac22 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 17:30:45 -0700 Subject: tools/runqslower: Fix LDFLAGS and add LDLIBS support Actually use previously defined LDFLAGS during build and add support for LDLIBS to link extra standalone libraries e.g. 'argp' which is not provided by musl libc. 
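A usage note (mine, not from the patch): variables given on the make command line override Makefile assignments, so a build that adds a standalone argp library has to repeat the default libraries, e.g. make -C tools/bpf/runqslower EXTRA_CFLAGS=-static EXTRA_LDFLAGS=-static LDLIBS='-lelf -lz -largp'.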
Fixes: 585bf4640ebe ("tools: runqslower: Add EXTRA_CFLAGS and EXTRA_LDFLAGS support") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/20240723003045.2273499-1-tony.ambardar@gmail.com --- tools/bpf/runqslower/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index d8288936c912..c4f1f1735af6 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -15,6 +15,7 @@ INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../include/uapi) CFLAGS := -g -Wall $(CLANG_CROSS_FLAGS) CFLAGS += $(EXTRA_CFLAGS) LDFLAGS += $(EXTRA_LDFLAGS) +LDLIBS += -lelf -lz # Try to detect best kernel BTF source KERNEL_REL := $(shell uname -r) @@ -51,7 +52,7 @@ clean: libbpf_hdrs: $(BPFOBJ) $(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) - $(QUIET_LINK)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o | libbpf_hdrs -- cgit v1.2.3 From 63a9936b45859b24780c86c32a32fc7b087c304e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 08:34:44 -0700 Subject: selftests/bpf: Add tests for ldsx of pkt data/data_end/data_meta accesses The following tests are added to verifier_ldsx.c: - sign extension of data/data_end/data_meta for tcx programs. The actual checking is in bpf_skb_is_valid_access(), which is called by sk_filter, cg_skb, lwt, tc(tcx) and sk_skb. - sign extension of data/data_end/data_meta for xdp programs. - sign extension of data/data_end for flow_dissector programs. All newly-added tests fail verification with the message "invalid bpf_context access". Without the previous patch, all of these tests passed verification. 
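For contrast, a sketch of mine (not from the patch) of the legal access pattern: plain 32-bit loads of these context fields are rewritten by the verifier into loads of the real packet pointers, and no such rewrite exists for the sign-extending BPF_MEMSX form, hence the "invalid bpf_context access" error the tests expect:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	char _license[] SEC("license") = "GPL";

	SEC("xdp")
	int pkt_bounds_check(struct xdp_md *ctx)
	{
		/* zero-extending u32 loads: converted to full pointer loads */
		void *data = (void *)(long)ctx->data;
		void *data_end = (void *)(long)ctx->data_end;

		/* the usual bounds check then operates on real pointers */
		if (data + 1 > data_end)
			return XDP_DROP;
		return XDP_PASS;
	}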
Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723153444.2430365-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/verifier_ldsx.c | 112 ++++++++++++++++++++++ 1 file changed, 112 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index d4427d8e1217..52edee41caf6 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -144,6 +144,118 @@ __naked void ldsx_s32_range(void) : __clobber_all); } +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_1(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data, offsetof(struct xdp_md, data)) + : __clobber_all); +} + +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_2(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data_end, offsetof(struct xdp_md, data_end)) + : __clobber_all); +} + +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data_meta") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_3(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data_meta]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data_meta, offsetof(struct xdp_md, data_meta)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_4(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data, offsetof(struct __sk_buff, data)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_5(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data_meta") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_6(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_meta]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_meta, offsetof(struct __sk_buff, data_meta)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("LDSX, flow_dissector s32 __sk_buff->data") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_7(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data, offsetof(struct __sk_buff, data)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("LDSX, flow_dissector s32 __sk_buff->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_8(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all); +} + #else SEC("socket") -- cgit v1.2.3 From 26672b5caf3cbeeffe8651035dddff95f610e7ec Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 09:29:40 -0700 Subject: selftests/bpf: Add reg_bounds tests for ldsx and 
subreg compare Add a few reg_bounds selftests to test 32/16/8-bit ldsx and subreg comparison. Without the previous patch, all added tests will fail. Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723162940.2732171-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/reg_bounds.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index eb74363f9f70..0da4225749bd 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -433,6 +433,19 @@ static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, y_cast = range_cast(y_t, x_t, y); + /* If we know that + * - *x* is in the range of a signed 32-bit value, and + * - the *y_cast* range is 32-bit signed non-negative, + * then the *x* range can be improved with *y_cast* such that the *x* range + * is 32-bit signed non-negative. Otherwise, if the new range for *x* + * allowed the upper 32 bits to be 0xffffffff, the eventual new range for + * *x* would be out of the signed 32-bit range, which violates the original + * *x* range. + */ + if (x_t == S64 && y_t == S32 && y_cast.a <= S32_MAX && y_cast.b <= S32_MAX && + (s64)x.a >= S32_MIN && (s64)x.b <= S32_MAX) + return range_improve(x_t, x, y_cast); + /* the case when new range knowledge, *y*, is a 32-bit subregister * range, while previous range knowledge, *x*, is a full register * 64-bit range, needs special treatment to take into account upper 32 @@ -2108,6 +2121,9 @@ static struct subtest_case crafted_cases[] = { {S32, U32, {(u32)S32_MIN, 0}, {0, 0}}, {S32, U32, {(u32)S32_MIN, 0}, {(u32)S32_MIN, (u32)S32_MIN}}, {S32, U32, {(u32)S32_MIN, S32_MAX}, {S32_MAX, S32_MAX}}, + {S64, U32, {0x0, 0x1f}, {0xffffffff80000000ULL, 0x000000007fffffffULL}}, + {S64, U32, {0x0, 0x1f}, {0xffffffffffff8000ULL, 0x0000000000007fffULL}}, + {S64, U32, {0x0, 0x1f}, {0xffffffffffffff80ULL, 0x000000000000007fULL}}, }; /* Go over crafted hard-coded cases. This is fast, so we do it as part of -- cgit v1.2.3 From 424ebaa3678b0d7f653dffb08eae90424740e0f4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:38 -0700 Subject: selftests/bpf: extract utility function for BPF disassembly struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz); Disassembles instruction 'insn' to a text buffer 'buf'. Removes the insn->code hex prefix added by the kernel disassembly routine. Returns a pointer to the next instruction (increments insn by either 1 or 2). 
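A usage sketch (mine; it assumes 'insns' and 'cnt' were fetched beforehand, e.g. with bpf_prog_get_info_by_fd()): because disasm_insn() both formats one instruction and advances the cursor, dumping a program is a single loop:

	struct bpf_insn *insn = insns, *end = insns + cnt;
	char buf[64];

	while (insn < end) {
		/* formats one instruction into buf; returns the next
		 * instruction, skipping the second half of ld_imm64
		 */
		insn = disasm_insn(insn, buf, sizeof(buf));
		printf("%s\n", buf);
	}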
Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/Makefile | 1 + tools/testing/selftests/bpf/disasm_helpers.c | 51 +++++++++++++++ tools/testing/selftests/bpf/disasm_helpers.h | 12 ++++ .../testing/selftests/bpf/prog_tests/ctx_rewrite.c | 74 +++------------------- tools/testing/selftests/bpf/testing_helpers.c | 1 + 5 files changed, 75 insertions(+), 64 deletions(-) create mode 100644 tools/testing/selftests/bpf/disasm_helpers.c create mode 100644 tools/testing/selftests/bpf/disasm_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3318d5425d75..888ba68e6592 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -661,6 +661,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ test_loader.c \ xsk.c \ disasm.c \ + disasm_helpers.c \ json_writer.c \ flow_dissector_load.h \ ip_check_defrag_frags.h diff --git a/tools/testing/selftests/bpf/disasm_helpers.c b/tools/testing/selftests/bpf/disasm_helpers.c new file mode 100644 index 000000000000..96b1f2ffe438 --- /dev/null +++ b/tools/testing/selftests/bpf/disasm_helpers.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +#include +#include "disasm.h" + +struct print_insn_context { + char *buf; + size_t sz; +}; + +static void print_insn_cb(void *private_data, const char *fmt, ...) +{ + struct print_insn_context *ctx = private_data; + va_list args; + + va_start(args, fmt); + vsnprintf(ctx->buf, ctx->sz, fmt, args); + va_end(args); +} + +struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) +{ + struct print_insn_context ctx = { + .buf = buf, + .sz = buf_sz, + }; + struct bpf_insn_cbs cbs = { + .cb_print = print_insn_cb, + .private_data = &ctx, + }; + char *tmp, *pfx_end, *sfx_start; + bool double_insn; + int len; + + print_bpf_insn(&cbs, insn, true); + /* We share code with kernel BPF disassembler, it adds '(FF) ' prefix + * for each instruction (FF stands for instruction `code` byte). + * Remove the prefix inplace, and also simplify call instructions. + * E.g.: "(85) call foo#10" -> "call foo". + * Also remove newline in the end (the 'max(strlen(buf) - 1, 0)' thing). + */ + pfx_end = buf + 5; + sfx_start = buf + max((int)strlen(buf) - 1, 0); + if (strncmp(pfx_end, "call ", 5) == 0 && (tmp = strrchr(buf, '#'))) + sfx_start = tmp; + len = sfx_start - pfx_end; + memmove(buf, pfx_end, len); + buf[len] = 0; + double_insn = insn->code == (BPF_LD | BPF_IMM | BPF_DW); + return insn + (double_insn ? 
2 : 1); +} diff --git a/tools/testing/selftests/bpf/disasm_helpers.h b/tools/testing/selftests/bpf/disasm_helpers.h new file mode 100644 index 000000000000..7b26cab70099 --- /dev/null +++ b/tools/testing/selftests/bpf/disasm_helpers.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __DISASM_HELPERS_H +#define __DISASM_HELPERS_H + +#include + +struct bpf_insn; + +struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz); + +#endif /* __DISASM_HELPERS_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index 08b6391f2f56..dd75ccb03770 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -10,7 +10,8 @@ #include "bpf/btf.h" #include "bpf_util.h" #include "linux/filter.h" -#include "disasm.h" +#include "linux/kernel.h" +#include "disasm_helpers.h" #define MAX_PROG_TEXT_SZ (32 * 1024) @@ -628,63 +629,6 @@ err: return false; } -static void print_insn(void *private_data, const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vfprintf((FILE *)private_data, fmt, args); - va_end(args); -} - -/* Disassemble instructions to a stream */ -static void print_xlated(FILE *out, struct bpf_insn *insn, __u32 len) -{ - const struct bpf_insn_cbs cbs = { - .cb_print = print_insn, - .cb_call = NULL, - .cb_imm = NULL, - .private_data = out, - }; - bool double_insn = false; - int i; - - for (i = 0; i < len; i++) { - if (double_insn) { - double_insn = false; - continue; - } - - double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); - print_bpf_insn(&cbs, insn + i, true); - } -} - -/* We share code with kernel BPF disassembler, it adds '(FF) ' prefix - * for each instruction (FF stands for instruction `code` byte). - * This function removes the prefix inplace for each line in `str`. - */ -static void remove_insn_prefix(char *str, int size) -{ - const int prefix_size = 5; - - int write_pos = 0, read_pos = prefix_size; - int len = strlen(str); - char c; - - size = min(size, len); - - while (read_pos < size) { - c = str[read_pos++]; - if (c == 0) - break; - str[write_pos++] = c; - if (c == '\n') - read_pos += prefix_size; - } - str[write_pos] = 0; -} - struct prog_info { char *prog_kind; enum bpf_prog_type prog_type; @@ -699,9 +643,10 @@ static void match_program(struct btf *btf, char *reg_map[][2], bool skip_first_insn) { - struct bpf_insn *buf = NULL; + struct bpf_insn *buf = NULL, *insn, *insn_end; int err = 0, prog_fd = 0; FILE *prog_out = NULL; + char insn_buf[64]; char *text = NULL; __u32 cnt = 0; @@ -739,12 +684,13 @@ static void match_program(struct btf *btf, PRINT_FAIL("Can't open memory stream\n"); goto out; } - if (skip_first_insn) - print_xlated(prog_out, buf + 1, cnt - 1); - else - print_xlated(prog_out, buf, cnt); + insn_end = buf + cnt; + insn = buf + (skip_first_insn ? 
1 : 0); + while (insn < insn_end) { + insn = disasm_insn(insn, insn_buf, sizeof(insn_buf)); + fprintf(prog_out, "%s\n", insn_buf); + } fclose(prog_out); - remove_insn_prefix(text, MAX_PROG_TEXT_SZ); ASSERT_TRUE(match_pattern(btf, pattern, text, reg_map), pinfo->prog_kind); diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index d5379a0e6da8..ac7c66f4fc7b 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -7,6 +7,7 @@ #include #include #include +#include "disasm.h" #include "test_progs.h" #include "testing_helpers.h" #include -- cgit v1.2.3 From 203e6aba7692bca18fd97251b1354da0f5e2ba30 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:39 -0700 Subject: selftests/bpf: print correct offset for pseudo calls in disasm_insn() Adjust disasm_helpers.c:disasm_insn() to account for the following part of the verifier.c:jit_subprogs: for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { /* ... */ if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); insn->imm = subprog; } where the verifier moves the subprogram's offset into the insn->off field. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/disasm_helpers.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/disasm_helpers.c b/tools/testing/selftests/bpf/disasm_helpers.c index 96b1f2ffe438..f529f1c8c171 100644 --- a/tools/testing/selftests/bpf/disasm_helpers.c +++ b/tools/testing/selftests/bpf/disasm_helpers.c @@ -4,6 +4,7 @@ #include "disasm.h" struct print_insn_context { + char scratch[16]; char *buf; size_t sz; }; @@ -18,6 +19,22 @@ static void print_insn_cb(void *private_data, const char *fmt, ...) va_end(args); } +static const char *print_call_cb(void *private_data, const struct bpf_insn *insn) +{ + struct print_insn_context *ctx = private_data; + + /* For pseudo calls verifier.c:jit_subprogs() hides the original + * imm in insn->off and changes insn->imm to be an index of + * the subprog instead. + */ + if (insn->src_reg == BPF_PSEUDO_CALL) { + snprintf(ctx->scratch, sizeof(ctx->scratch), "%+d", insn->off); + return ctx->scratch; + } + + return NULL; +} + struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) { struct print_insn_context ctx = { @@ -26,6 +43,7 @@ struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) }; struct bpf_insn_cbs cbs = { .cb_print = print_insn_cb, + .cb_call = print_call_cb, .private_data = &ctx, }; char *tmp, *pfx_end, *sfx_start; -- cgit v1.2.3 From 4ef5d6af493558124b7a6c13cace58b938fe27d4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:40 -0700 Subject: selftests/bpf: no need to track next_match_pos in struct test_loader The call stack for the validate_case() function looks as follows: - test_loader__run_subtests() - process_subtest() - run_subtest() - prepare_case(), which does 'tester->next_match_pos = 0'; - validate_case(), which increments tester->next_match_pos. Hence, each subtest is run with next_match_pos freshly set to zero, meaning there is no need to persist this variable in struct test_loader; use a local variable instead. 
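A simplified sketch (mine, substring path only) of the resulting loop; the cursor is now a plain local that advances past each match, which also makes the in-order matching contract easy to see:

	const char *log = tester->log_buf;	/* local cursor, nothing persisted */

	for (i = 0; i < subspec->expect_msg_cnt; i++) {
		match = strstr(log, subspec->expect_msgs[i].substr);
		if (!ASSERT_OK_PTR(match, "expect_msg"))
			break;	/* pattern missing or out of order */
		log = match + strlen(subspec->expect_msgs[i].substr);
	}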
Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/test_loader.c | 19 ++++++++----------- tools/testing/selftests/bpf/test_progs.h | 1 - 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index f14e10b0de96..47508cf66e89 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -434,7 +434,6 @@ static void prepare_case(struct test_loader *tester, bpf_program__set_flags(prog, prog_flags | spec->prog_flags); tester->log_buf[0] = '\0'; - tester->next_match_pos = 0; } static void emit_verifier_log(const char *log_buf, bool force) @@ -450,25 +449,23 @@ static void validate_case(struct test_loader *tester, struct bpf_program *prog, int load_err) { - int i, j, err; - char *match; regmatch_t reg_match[1]; + const char *log = tester->log_buf; + int i, j, err; for (i = 0; i < subspec->expect_msg_cnt; i++) { struct expect_msg *msg = &subspec->expect_msgs[i]; + const char *match = NULL; if (msg->substr) { - match = strstr(tester->log_buf + tester->next_match_pos, msg->substr); + match = strstr(log, msg->substr); if (match) - tester->next_match_pos = match - tester->log_buf + strlen(msg->substr); + log += strlen(msg->substr); } else { - err = regexec(&msg->regex, - tester->log_buf + tester->next_match_pos, 1, reg_match, 0); + err = regexec(&msg->regex, log, 1, reg_match, 0); if (err == 0) { - match = tester->log_buf + tester->next_match_pos + reg_match[0].rm_so; - tester->next_match_pos += reg_match[0].rm_eo; - } else { - match = NULL; + match = log + reg_match[0].rm_so; + log += reg_match[0].rm_eo; } } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 51341d50213b..b1e949fb16cf 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -447,7 +447,6 @@ typedef int (*pre_execution_cb)(struct bpf_object *obj); struct test_loader { char *log_buf; size_t log_buf_sz; - size_t next_match_pos; pre_execution_cb pre_execution_cb; struct bpf_object *obj; -- cgit v1.2.3 From 64f01e935ddb26f48baec71883c27878ac4231dc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:41 -0700 Subject: selftests/bpf: extract test_loader->expect_msgs as a data structure Non-functional change: use a separate data structure to represent expected messages in test_loader. This would allow using the same functionality for the expected set of disassembled instructions in the follow-up commit. 
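The payoff, sketched with names from this and the next commits in the series: one matcher can now be pointed at any text, so the disassembly check added later is a second call site rather than a second implementation:

	/* verifier log expectations */
	validate_msgs(tester->log_buf, &subspec->expect_msgs, emit_verifier_log);
	/* and, once the follow-up lands, xlated-program expectations */
	validate_msgs(xlated_text, &subspec->expect_xlated, emit_xlated);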
Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/test_loader.c | 81 ++++++++++++++++--------------- 1 file changed, 41 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 47508cf66e89..3f84903558dd 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -55,11 +55,15 @@ struct expect_msg { regex_t regex; }; +struct expected_msgs { + struct expect_msg *patterns; + size_t cnt; +}; + struct test_subspec { char *name; bool expect_failure; - struct expect_msg *expect_msgs; - size_t expect_msg_cnt; + struct expected_msgs expect_msgs; int retval; bool execute; }; @@ -96,44 +100,45 @@ void test_loader_fini(struct test_loader *tester) free(tester->log_buf); } -static void free_test_spec(struct test_spec *spec) +static void free_msgs(struct expected_msgs *msgs) { int i; + for (i = 0; i < msgs->cnt; i++) + if (msgs->patterns[i].regex_str) + regfree(&msgs->patterns[i].regex); + free(msgs->patterns); + msgs->patterns = NULL; + msgs->cnt = 0; +} + +static void free_test_spec(struct test_spec *spec) +{ /* Deallocate expect_msgs arrays. */ - for (i = 0; i < spec->priv.expect_msg_cnt; i++) - if (spec->priv.expect_msgs[i].regex_str) - regfree(&spec->priv.expect_msgs[i].regex); - for (i = 0; i < spec->unpriv.expect_msg_cnt; i++) - if (spec->unpriv.expect_msgs[i].regex_str) - regfree(&spec->unpriv.expect_msgs[i].regex); + free_msgs(&spec->priv.expect_msgs); + free_msgs(&spec->unpriv.expect_msgs); free(spec->priv.name); free(spec->unpriv.name); - free(spec->priv.expect_msgs); - free(spec->unpriv.expect_msgs); - spec->priv.name = NULL; spec->unpriv.name = NULL; - spec->priv.expect_msgs = NULL; - spec->unpriv.expect_msgs = NULL; } -static int push_msg(const char *substr, const char *regex_str, struct test_subspec *subspec) +static int push_msg(const char *substr, const char *regex_str, struct expected_msgs *msgs) { void *tmp; int regcomp_res; char error_msg[100]; struct expect_msg *msg; - tmp = realloc(subspec->expect_msgs, - (1 + subspec->expect_msg_cnt) * sizeof(struct expect_msg)); + tmp = realloc(msgs->patterns, + (1 + msgs->cnt) * sizeof(struct expect_msg)); if (!tmp) { ASSERT_FAIL("failed to realloc memory for messages\n"); return -ENOMEM; } - subspec->expect_msgs = tmp; - msg = &subspec->expect_msgs[subspec->expect_msg_cnt]; + msgs->patterns = tmp; + msg = &msgs->patterns[msgs->cnt]; if (substr) { msg->substr = substr; @@ -150,7 +155,7 @@ static int push_msg(const char *substr, const char *regex_str, struct test_subsp } } - subspec->expect_msg_cnt += 1; + msgs->cnt += 1; return 0; } @@ -272,25 +277,25 @@ static int parse_test_spec(struct test_loader *tester, spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; - err = push_msg(msg, NULL, &spec->priv); + err = push_msg(msg, NULL, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV)) { msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX_UNPRIV) - 1; - err = push_msg(msg, NULL, &spec->unpriv); + err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX)) { msg = s + 
sizeof(TEST_TAG_EXPECT_REGEX_PFX) - 1; - err = push_msg(NULL, msg, &spec->priv); + err = push_msg(NULL, msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV)) { msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX_UNPRIV) - 1; - err = push_msg(NULL, msg, &spec->unpriv); + err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -387,11 +392,12 @@ static int parse_test_spec(struct test_loader *tester, spec->unpriv.execute = spec->priv.execute; } - if (!spec->unpriv.expect_msgs) { - for (i = 0; i < spec->priv.expect_msg_cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_msgs[i]; + if (spec->unpriv.expect_msgs.cnt == 0) { + for (i = 0; i < spec->priv.expect_msgs.cnt; i++) { + struct expect_msg *msg = &spec->priv.expect_msgs.patterns[i]; - err = push_msg(msg->substr, msg->regex_str, &spec->unpriv); + err = push_msg(msg->substr, msg->regex_str, + &spec->unpriv.expect_msgs); if (err) goto cleanup; } @@ -443,18 +449,14 @@ static void emit_verifier_log(const char *log_buf, bool force) fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); } -static void validate_case(struct test_loader *tester, - struct test_subspec *subspec, - struct bpf_object *obj, - struct bpf_program *prog, - int load_err) +static void validate_msgs(char *log_buf, struct expected_msgs *msgs) { regmatch_t reg_match[1]; - const char *log = tester->log_buf; + const char *log = log_buf; int i, j, err; - for (i = 0; i < subspec->expect_msg_cnt; i++) { - struct expect_msg *msg = &subspec->expect_msgs[i]; + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; const char *match = NULL; if (msg->substr) { @@ -471,9 +473,9 @@ static void validate_case(struct test_loader *tester, if (!ASSERT_OK_PTR(match, "expect_msg")) { if (env.verbosity == VERBOSE_NONE) - emit_verifier_log(tester->log_buf, true /*force*/); + emit_verifier_log(log_buf, true /*force*/); for (j = 0; j <= i; j++) { - msg = &subspec->expect_msgs[j]; + msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", j < i ? "MATCHED " : "EXPECTED", msg->substr ? "SUBSTR" : " REGEX", @@ -692,9 +694,8 @@ void run_subtest(struct test_loader *tester, goto tobj_cleanup; } } - emit_verifier_log(tester->log_buf, false /*force*/); - validate_case(tester, subspec, tobj, tprog, err); + validate_msgs(tester->log_buf, &subspec->expect_msgs); if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs -- cgit v1.2.3 From 9c9f7339131030949a8ef111080427ff1a8085b5 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:42 -0700 Subject: selftests/bpf: allow checking xlated programs in verifier_* tests Add a macro __xlated("...") for use with test_loader tests. When such annotations are present for the test case: - bpf_prog_get_info_by_fd() is used to get BPF program after all rewrites are applied by verifier. - the program is disassembled and patterns specified in __xlated are searched for in the disassembly text. __xlated matching follows the same mechanics as __msg: each subsequent pattern is matched from the point where previous pattern ended. 
This allows writing tests like the one below, where the goal is to verify the behavior of one of the transformations applied by the verifier: SEC("raw_tp") __xlated("1: w0 = ") __xlated("2: r0 = &(void __percpu *)(r0)") __xlated("3: r0 = *(u32 *)(r0 +0)") __xlated("4: exit") __success __naked void simple(void) { asm volatile ( "call %[bpf_get_smp_processor_id];" "exit;" : : __imm(bpf_get_smp_processor_id) : __clobber_all); } Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/bpf_misc.h | 5 ++ tools/testing/selftests/bpf/test_loader.c | 82 +++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 81097a3f15eb..a70939c7bc26 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -26,6 +26,9 @@ * * __regex Same as __msg, but using a regular expression. * __regex_unpriv Same as __msg_unpriv but using a regular expression. + * __xlated Expect a line in the disassembly log after the verifier applies rewrites. + * Multiple __xlated attributes can be specified. + * __xlated_unpriv Same as __xlated but for unprivileged mode. * * __success Expect program load success in privileged mode. * __success_unpriv Expect program load success in unprivileged mode. @@ -63,11 +66,13 @@ */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) #define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) +#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" msg))) #define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" regex))) +#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 3f84903558dd..b44b6a2fc82c 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -7,6 +7,7 @@ #include #include "autoconf_helper.h" +#include "disasm_helpers.h" #include "unpriv_helpers.h" #include "cap_helpers.h" @@ -19,10 +20,12 @@ #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" #define TEST_TAG_EXPECT_REGEX_PFX "comment:test_expect_regex=" +#define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" #define
TEST_TAG_EXPECT_REGEX_PFX_UNPRIV "comment:test_expect_regex_unpriv=" +#define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" #define TEST_TAG_DESCRIPTION_PFX "comment:test_description=" @@ -64,6 +67,7 @@ struct test_subspec { char *name; bool expect_failure; struct expected_msgs expect_msgs; + struct expected_msgs expect_xlated; int retval; bool execute; }; @@ -117,6 +121,8 @@ static void free_test_spec(struct test_spec *spec) /* Deallocate expect_msgs arrays. */ free_msgs(&spec->priv.expect_msgs); free_msgs(&spec->unpriv.expect_msgs); + free_msgs(&spec->priv.expect_xlated); + free_msgs(&spec->unpriv.expect_xlated); free(spec->priv.name); free(spec->unpriv.name); @@ -299,6 +305,18 @@ static int parse_test_spec(struct test_loader *tester, if (err) goto cleanup; spec->mode_mask |= UNPRIV; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX)) { + msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX) - 1; + err = push_msg(msg, NULL, &spec->priv.expect_xlated); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV)) { + msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX_UNPRIV) - 1; + err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); + if (err) + goto cleanup; + spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_RETVAL_PFX)) { val = s + sizeof(TEST_TAG_RETVAL_PFX) - 1; err = parse_retval(val, &spec->priv.retval, "__retval"); @@ -402,6 +420,16 @@ static int parse_test_spec(struct test_loader *tester, goto cleanup; } } + if (spec->unpriv.expect_xlated.cnt == 0) { + for (i = 0; i < spec->priv.expect_xlated.cnt; i++) { + struct expect_msg *msg = &spec->priv.expect_xlated.patterns[i]; + + err = push_msg(msg->substr, msg->regex_str, + &spec->unpriv.expect_xlated); + if (err) + goto cleanup; + } + } } spec->valid = true; @@ -449,7 +477,15 @@ static void emit_verifier_log(const char *log_buf, bool force) fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); } -static void validate_msgs(char *log_buf, struct expected_msgs *msgs) +static void emit_xlated(const char *xlated, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "XLATED:\n=============\n%s=============\n", xlated); +} + +static void validate_msgs(char *log_buf, struct expected_msgs *msgs, + void (*emit_fn)(const char *buf, bool force)) { regmatch_t reg_match[1]; const char *log = log_buf; @@ -473,7 +509,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs) if (!ASSERT_OK_PTR(match, "expect_msg")) { if (env.verbosity == VERBOSE_NONE) - emit_verifier_log(log_buf, true /*force*/); + emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", @@ -610,6 +646,37 @@ static bool should_do_test_run(struct test_spec *spec, struct test_subspec *subs return true; } +/* Get a disassembly of BPF program after verifier applies all rewrites */ +static int get_xlated_program_text(int prog_fd, char *text, size_t text_sz) +{ + struct bpf_insn *insn_start = NULL, *insn, *insn_end; + __u32 insns_cnt = 0, i; + char buf[64]; + FILE *out = NULL; + int err; + + err = get_xlated_program(prog_fd, &insn_start, &insns_cnt); + if (!ASSERT_OK(err, "get_xlated_program")) + goto out; + out = fmemopen(text, text_sz, "w"); + if (!ASSERT_OK_PTR(out, "open_memstream")) + goto out; + insn_end = insn_start + insns_cnt; + insn = insn_start; 
+ while (insn < insn_end) { + i = insn - insn_start; + insn = disasm_insn(insn, buf, sizeof(buf)); + fprintf(out, "%d: %s\n", i, buf); + } + fflush(out); + +out: + free(insn_start); + if (out) + fclose(out); + return err; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -695,7 +762,16 @@ void run_subtest(struct test_loader *tester, } } emit_verifier_log(tester->log_buf, false /*force*/); - validate_msgs(tester->log_buf, &subspec->expect_msgs); + validate_msgs(tester->log_buf, &subspec->expect_msgs, emit_verifier_log); + + if (subspec->expect_xlated.cnt) { + err = get_xlated_program_text(bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err) + goto tobj_cleanup; + emit_xlated(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->expect_xlated, emit_xlated); + } if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs -- cgit v1.2.3 From ee7fe84468b1732fe65c5af3836437d54ac4c419 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:43 -0700 Subject: selftests/bpf: __arch_* macro to limit test cases to specific archs Add annotations __arch_x86_64, __arch_arm64, __arch_riscv64 to specify on which architecture the test case should be tested. Several __arch_* annotations could be specified at once. When test case is not run on current arch it is marked as skipped. For example, the following would be tested only on arm64 and riscv64: SEC("raw_tp") __arch_arm64 __arch_riscv64 __xlated("1: *(u64 *)(r10 - 16) = r1") __xlated("2: call") __xlated("3: r1 = *(u64 *)(r10 - 16);") __success __naked void canary_arm64_riscv64(void) { asm volatile ( "r1 = 1;" "*(u64 *)(r10 - 16) = r1;" "call %[bpf_get_smp_processor_id];" "r1 = *(u64 *)(r10 - 16);" "exit;" : : __imm(bpf_get_smp_processor_id) : __clobber_all); } On x86 it would be skipped: #467/2 verifier_nocsr/canary_arm64_riscv64:SKIP Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/bpf_misc.h | 8 ++++++ tools/testing/selftests/bpf/test_loader.c | 43 ++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a70939c7bc26..a225cd87897c 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -63,6 +63,10 @@ * __auxiliary Annotated program is not a separate test, but used as auxiliary * for some other test cases and should always be loaded. * __auxiliary_unpriv Same, but load program in unprivileged mode. + * + * __arch_* Specify on which architecture the test case should be tested. + * Several __arch_* annotations could be specified at once. + * When test case is not run on current arch it is marked as skipped. 
*/ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) #define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) @@ -82,6 +86,10 @@ #define __auxiliary __attribute__((btf_decl_tag("comment:test_auxiliary"))) #define __auxiliary_unpriv __attribute__((btf_decl_tag("comment:test_auxiliary_unpriv"))) #define __btf_path(path) __attribute__((btf_decl_tag("comment:test_btf_path=" path))) +#define __arch(arch) __attribute__((btf_decl_tag("comment:test_arch=" arch))) +#define __arch_x86_64 __arch("X86_64") +#define __arch_arm64 __arch("ARM64") +#define __arch_riscv64 __arch("RISCV64") /* Convenience macro for use with 'asm volatile' blocks */ #define __naked __attribute__((naked)) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b44b6a2fc82c..12b0c41e8d64 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -34,6 +34,7 @@ #define TEST_TAG_AUXILIARY "comment:test_auxiliary" #define TEST_TAG_AUXILIARY_UNPRIV "comment:test_auxiliary_unpriv" #define TEST_BTF_PATH "comment:test_btf_path=" +#define TEST_TAG_ARCH "comment:test_arch=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xcafe4all @@ -80,6 +81,7 @@ struct test_spec { int log_level; int prog_flags; int mode_mask; + int arch_mask; bool auxiliary; bool valid; }; @@ -213,6 +215,12 @@ static void update_flags(int *flags, int flag, bool clear) *flags |= flag; } +enum arch { + ARCH_X86_64 = 0x1, + ARCH_ARM64 = 0x2, + ARCH_RISCV64 = 0x4, +}; + /* Uses btf_decl_tag attributes to describe the expected test * behavior, see bpf_misc.h for detailed description of each attribute * and attribute combinations. @@ -226,6 +234,7 @@ static int parse_test_spec(struct test_loader *tester, bool has_unpriv_result = false; bool has_unpriv_retval = false; int func_id, i, err = 0; + u32 arch_mask = 0; struct btf *btf; memset(spec, 0, sizeof(*spec)); @@ -364,11 +373,26 @@ static int parse_test_spec(struct test_loader *tester, goto cleanup; update_flags(&spec->prog_flags, flags, clear); } + } else if (str_has_pfx(s, TEST_TAG_ARCH)) { + val = s + sizeof(TEST_TAG_ARCH) - 1; + if (strcmp(val, "X86_64") == 0) { + arch_mask |= ARCH_X86_64; + } else if (strcmp(val, "ARM64") == 0) { + arch_mask |= ARCH_ARM64; + } else if (strcmp(val, "RISCV64") == 0) { + arch_mask |= ARCH_RISCV64; + } else { + PRINT_FAIL("bad arch spec: '%s'", val); + err = -EINVAL; + goto cleanup; + } } else if (str_has_pfx(s, TEST_BTF_PATH)) { spec->btf_custom_path = s + sizeof(TEST_BTF_PATH) - 1; } } + spec->arch_mask = arch_mask; + if (spec->mode_mask == 0) spec->mode_mask = PRIV; @@ -677,6 +701,20 @@ out: return err; } +static bool run_on_current_arch(int arch_mask) +{ + if (arch_mask == 0) + return true; +#if defined(__x86_64__) + return arch_mask & ARCH_X86_64; +#elif defined(__aarch64__) + return arch_mask & ARCH_ARM64; +#elif defined(__riscv) && __riscv_xlen == 64 + return arch_mask & ARCH_RISCV64; +#endif + return false; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -701,6 +739,11 @@ void run_subtest(struct test_loader *tester, if (!test__start_subtest(subspec->name)) return; + if (!run_on_current_arch(spec->arch_mask)) { + test__skip(); + return; + } + if (unpriv) { if (!can_execute_unpriv(tester, spec)) { test__skip(); -- cgit v1.2.3 From d0ad1f8f8846cffebca55abdd1ed275e276a6754 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: 
Mon, 22 Jul 2024 16:38:44 -0700 Subject: selftests/bpf: test no_caller_saved_registers spill/fill removal Tests for no_caller_saved_registers processing logic (see verifier.c:match_and_mark_nocsr_pattern()): - a canary positive test case; - a canary test case for arm64 and riscv64; - various tests with broken patterns; - tests with read/write fixed/varying stack access that violate nocsr stack access contract; - tests with multiple subprograms; - tests using nocsr in combination with may_goto/bpf_loop, as all of these features affect stack depth; - tests for nocsr stack spills below max stack depth. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-11-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + tools/testing/selftests/bpf/progs/verifier_nocsr.c | 796 +++++++++++++++++++++ 2 files changed, 798 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_nocsr.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index ff1c7da1d06e..67a49d12472c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,6 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" +#include "verifier_nocsr.skel.h" #include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" @@ -173,6 +174,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } +void test_verifier_nocsr(void) { RUN(verifier_nocsr); } void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_nocsr.c new file mode 100644 index 000000000000..a7fe277e5167 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_nocsr.c @@ -0,0 +1,796 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("4: r5 = 5") +__xlated("5: w0 = ") +__xlated("6: r0 = &(void __percpu *)(r0)") +__xlated("7: r0 = *(u32 *)(r0 +0)") +__xlated("8: exit") +__success +__naked void simple(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r10 - 16) = r1;" + "*(u64 *)(r10 - 24) = r2;" + "*(u64 *)(r10 - 32) = r3;" + "*(u64 *)(r10 - 40) = r4;" + "*(u64 *)(r10 - 48) = r5;" + "call %[bpf_get_smp_processor_id];" + "r5 = *(u64 *)(r10 - 48);" + "r4 = *(u64 *)(r10 - 40);" + "r3 = *(u64 *)(r10 - 32);" + "r2 = *(u64 *)(r10 - 24);" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +/* The logic for detecting and verifying nocsr pattern is the same for + * any arch, however x86 differs from arm64 or riscv64 in a way + * bpf_get_smp_processor_id is rewritten: + * - on x86 it is done by verifier + * - on arm64 and riscv64 it is done by jit 
+ * + * Which leads to different xlated patterns for different archs: + * - on x86 the call is expanded as 3 instructions + * - on arm64 and riscv64 the call remains as is + * (but spills/fills are still removed) + * + * It is really desirable to check instruction indexes in the xlated + * patterns, so add this canary test to check that function rewrite by + * jit is correctly processed by nocsr logic, keep the rest of the + * tests as x86. + */ +SEC("raw_tp") +__arch_arm64 +__arch_riscv64 +__xlated("0: r1 = 1") +__xlated("1: call bpf_get_smp_processor_id") +__xlated("2: exit") +__success +__naked void canary_arm64_riscv64(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: r0 = &(void __percpu *)(r0)") +__xlated("3: exit") +__success +__naked void canary_zero_spills(void) +{ + asm volatile ( + "call %[bpf_get_smp_processor_id];" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r2 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r6") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r6 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern2(void) +{ + asm volatile ( + "r6 = 1;" + "*(u64 *)(r10 - 16) = r6;" + "call %[bpf_get_smp_processor_id];" + "r6 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r0") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern3(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r10 - 16) = r0;" + "call %[bpf_get_smp_processor_id];" + "r0 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r2 -16) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u64 *)(r10 -16)") +__success +__naked void wrong_base_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = r10;" + "*(u64 *)(r2 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r2 = 1") +__success +__naked void wrong_insn_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = 1;" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r10 -16) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u64 *)(r10 -8)") +__success +__naked void wrong_off_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 
*)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -4) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u32 *)(r10 -4)") +__success +__naked void wrong_off_in_pattern2(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 4) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 4);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u32 *)(r10 -16)") +__success +__naked void wrong_size_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u32 *)(r10 -8) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u32 *)(r10 -8)") +__success +__naked void partial_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "*(u32 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u32 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("0: r1 = 1") +__xlated("1: r2 = 2") +/* not patched, spills for -8, -16 not removed */ +__xlated("2: *(u64 *)(r10 -8) = r1") +__xlated("3: *(u64 *)(r10 -16) = r2") +__xlated("5: r0 = &(void __percpu *)(r0)") +__xlated("7: r2 = *(u64 *)(r10 -16)") +__xlated("8: r1 = *(u64 *)(r10 -8)") +/* patched, spills for -24, -32 removed */ +__xlated("10: r0 = &(void __percpu *)(r0)") +__xlated("12: exit") +__success +__naked void min_stack_offset(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + /* this call won't be patched */ + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u64 *)(r10 - 8);" + /* this call would be patched */ + "*(u64 *)(r10 - 24) = r1;" + "*(u64 *)(r10 - 32) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 32);" + "r1 = *(u64 *)(r10 - 24);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_read(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_write(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r1 - 0) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_read(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ + "r6 &= 0x7;" /* r6 range 
[0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "r1 = *(u8 *)(r1 - 0);" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_write(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ + "r6 &= 0x7;" /* r6 range [0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "*(u8 *)(r1 - 0) = r7;" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_write_in_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "call bad_write_in_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void bad_write_in_subprog_aux(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r1 - 0) = r0;" /* invalidates nocsr contract for caller: */ + "exit;" /* caller stack at -8 used outside of the pattern */ + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_helper_write(void) +{ + asm volatile ( + "r1 = 1;" + /* nocsr pattern with stack offset -8 */ + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r2 = 1;" + "r3 = 42;" + /* read dst is fp[-8], thus nocsr rewrite not applied */ + "call %[bpf_probe_read_kernel];" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_probe_read_kernel) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main, not patched */ +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__xlated("9: call pc+1") +__xlated("10: exit") +/* subprogram, patched */ +__xlated("11: r1 = 1") +__xlated("13: r0 = &(void __percpu *)(r0)") +__xlated("15: exit") +__success +__naked void invalidate_one_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "call invalidate_one_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void invalidate_one_subprog_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main */ +__xlated("0: r1 = 1") +__xlated("2: r0 = &(void __percpu *)(r0)") +__xlated("4: call pc+1") +__xlated("5: exit") +/* subprogram */ 
+__xlated("6: r1 = 1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: *(u64 *)(r10 -16) = r1") +__xlated("11: exit") +__success +__naked void subprogs_use_independent_offsets(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "call subprogs_use_independent_offsets_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void subprogs_use_independent_offsets_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 24) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 24);" + "*(u64 *)(r10 - 16) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("2: r0 = &(void __percpu *)(r0)") +__success +__naked void helper_call_does_not_prevent_nocsr(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +/* may_goto counter at -16 */ +__xlated("0: *(u64 *)(r10 -16) =") +__xlated("1: r1 = 1") +__xlated("3: r0 = &(void __percpu *)(r0)") +/* may_goto expansion starts */ +__xlated("5: r11 = *(u64 *)(r10 -16)") +__xlated("6: if r11 == 0x0 goto pc+3") +__xlated("7: r11 -= 1") +__xlated("8: *(u64 *)(r10 -16) = r11") +/* may_goto expansion ends */ +__xlated("9: *(u64 *)(r10 -8) = r1") +__xlated("10: exit") +__success +__naked void may_goto_interaction(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + ".8byte %[may_goto];" + /* just touch some stack at -8 */ + "*(u64 *)(r10 - 8) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +__used +__naked static void dummy_loop_callback(void) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 32+0") +__xlated("2: r1 = 1") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* bpf_loop params setup */ +__xlated("6: r2 =") +__xlated("7: r3 = 0") +__xlated("8: r4 = 0") +/* ... part of the inlined bpf_loop */ +__xlated("12: *(u64 *)(r10 -32) = r6") +__xlated("13: *(u64 *)(r10 -24) = r7") +__xlated("14: *(u64 *)(r10 -16) = r8") +/* ... */ +__xlated("21: call pc+8") /* dummy_loop_callback */ +/* ... 
last insns of the bpf_loop_interaction1 */ +__xlated("28: r0 = 0") +__xlated("29: exit") +/* dummy_loop_callback */ +__xlated("30: r0 = 0") +__xlated("31: exit") +__success +__naked int bpf_loop_interaction1(void) +{ + asm volatile ( + "r1 = 1;" + /* nocsr stack region at -16, but could be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 40+0") +/* call bpf_get_smp_processor_id */ +__xlated("2: r1 = 42") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* call bpf_get_prandom_u32 */ +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("7: call") +__xlated("8: r1 = *(u64 *)(r10 -16)") +/* ... */ +/* ... part of the inlined bpf_loop */ +__xlated("15: *(u64 *)(r10 -40) = r6") +__xlated("16: *(u64 *)(r10 -32) = r7") +__xlated("17: *(u64 *)(r10 -24) = r8") +__success +__naked int bpf_loop_interaction2(void) +{ + asm volatile ( + "r1 = 42;" + /* nocsr stack region at -16, cannot be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512+0") +/* just to print xlated version when debugging */ +__xlated("r0 = &(void __percpu *)(r0)") +__success +/* cumulative_stack_depth() stack usage is MAX_BPF_STACK, + * called subprogram uses an additional slot for nocsr spill/fill, + * since nocsr spill/fill could be removed the program still fits + * in MAX_BPF_STACK and should be accepted. 
+ */ +__naked int cumulative_stack_depth(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "call cumulative_stack_depth_subprog;" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK) + : __clobber_all + ); +} + +__used +__naked static void cumulative_stack_depth_subprog(void) +{ + asm volatile ( + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + :: __imm(bpf_get_smp_processor_id) : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512") +__xlated("0: r1 = 42") +__xlated("1: *(u64 *)(r10 -512) = r1") +__xlated("2: w0 = ") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("4: r0 = *(u32 *)(r0 +0)") +__xlated("5: exit") +__success +__naked int nocsr_max_stack_ok(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id) + : __clobber_all + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 520") +__failure +__naked int nocsr_max_stack_fail(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + /* call to prandom blocks nocsr rewrite */ + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6b376e7543dd15faec4e3878bc1a187126532985 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 23 Jul 2024 20:57:43 +0000 Subject: selftests/bpf: Make %.test.d prerequisite order only %.test.o should depend on %.test.d order-only to avoid unnecessary recompilations due to compiler dumping .d and .o files in random order. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/all/gSoCpn9qV5K0hRvrvYlrw2StRntsvZcrUuDfkZUh1Ang9E6yZ9XJGYDuIP9iCuM2YTVhSEzEXCteQ94_0uIUjx_mXwupFJt64NJaiMr99a0=@pm.me Link: https://lore.kernel.org/all/FnnOUuDMmf0SebqA1bb0fQIW4vguOZ-VcAlPnPMnmT2lJYxMMxFAhcgh77px8MsPS5Fr01I0YQxLJClEJTFWHdpaTBVSQhlmsVTcEsNQbV4=@pm.me Link: https://lore.kernel.org/bpf/yyjJRl5LODbI4-FseU0wIP5e4ik0zAy7Sy-5eGwrzG_UanI8rwWlQPfXAFnn_27hoZFogoUHRSWxFsLk7hPr0b6P5TZ3cRrM30_ggnu555M=@pm.me --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 888ba68e6592..083f413a9c98 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -598,7 +598,7 @@ endif # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ - $(TRUNNER_OUTPUT)/%.test.d + | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. 
$$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) -- cgit v1.2.3 From ec4fe2f0fa12fd2d0115df7e58414dc26899cc5e Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:28 -0700 Subject: selftests/bpf: Use pid_t consistently in test_progs.c Use pid_t rather than __pid_t when allocating memory for 'worker_pids' in 'struct test_env', as this is its declared type and also avoids compile errors seen building against musl libc on mipsel64: test_progs.c:1738:49: error: '__pid_t' undeclared (first use in this function); did you mean 'pid_t'? 1738 | env.worker_pids = calloc(sizeof(__pid_t), env.workers); | ^~~~~~~ | pid_t test_progs.c:1738:49: note: each undeclared identifier is reported only once for each function it appears in Fixes: 91b2c0afd00c ("selftests/bpf: Add parallelism to test_progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Geliang Tang Link: https://lore.kernel.org/bpf/c6447da51a94babc1931711a43e2ceecb135c93d.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 89ff704e9dad..60c5ec0f6abf 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1731,7 +1731,7 @@ int main(int argc, char **argv) /* launch workers if requested */ env.worker_id = -1; /* main process */ if (env.workers) { - env.worker_pids = calloc(sizeof(__pid_t), env.workers); + env.worker_pids = calloc(sizeof(pid_t), env.workers); env.worker_socks = calloc(sizeof(int), env.workers); if (env.debug) fprintf(stdout, "Launching %d workers.\n", env.workers); -- cgit v1.2.3 From d393f9479d4aaab0fa4c3caf513f28685e831f13 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:29 -0700 Subject: selftests/bpf: Fix compile error from rlim_t in sk_storage_map.c Cast 'rlim_t' argument to match expected type of printf() format and avoid compile errors seen building for mips64el/musl-libc: In file included from map_tests/sk_storage_map.c:20: map_tests/sk_storage_map.c: In function 'test_sk_storage_map_stress_free': map_tests/sk_storage_map.c:414:56: error: format '%lu' expects argument of type 'long unsigned int', but argument 2 has type 'rlim_t' {aka 'long long unsigned int'} [-Werror=format=] 414 | CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", | ^~~~~~~~~~~~~~~~~~~~~~~ 415 | rlim_new.rlim_cur, errno); | ~~~~~~~~~~~~~~~~~ | | | rlim_t {aka long long unsigned int} ./test_maps.h:12:24: note: in definition of macro 'CHECK' 12 | printf(format); \ | ^~~~~~ map_tests/sk_storage_map.c:414:68: note: format string is defined here 414 | CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", | ~~^ | | | long unsigned int | %llu cc1: all warnings being treated as errors Fixes: 51a0e301a563 ("bpf: Add BPF_MAP_TYPE_SK_STORAGE test to test_maps") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1e00a1fa7acf91b4ca135c4102dc796d518bad86.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/map_tests/sk_storage_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c index 18405c3b7cee..af10c309359a 100644 --- a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c +++ 
b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c @@ -412,7 +412,7 @@ static void test_sk_storage_map_stress_free(void) rlim_new.rlim_max = rlim_new.rlim_cur + 128; err = setrlimit(RLIMIT_NOFILE, &rlim_new); CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", - rlim_new.rlim_cur, errno); + (unsigned long) rlim_new.rlim_cur, errno); } err = do_sk_storage_map_stress_free(); -- cgit v1.2.3 From 7b10f0c227ce3fa055d601f058dc411092a62a78 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:30 -0700 Subject: selftests/bpf: Fix error compiling bpf_iter_setsockopt.c with musl libc Existing code calls getsockname() with a 'struct sockaddr_in6 *' argument where a 'struct sockaddr *' argument is declared, yielding compile errors when building for mips64el/musl-libc: bpf_iter_setsockopt.c: In function 'get_local_port': bpf_iter_setsockopt.c:98:30: error: passing argument 2 of 'getsockname' from incompatible pointer type [-Werror=incompatible-pointer-types] 98 | if (!getsockname(fd, &addr, &addrlen)) | ^~~~~ | | | struct sockaddr_in6 * In file included from .../netinet/in.h:10, from .../arpa/inet.h:9, from ./test_progs.h:17, from bpf_iter_setsockopt.c:5: .../sys/socket.h:391:23: note: expected 'struct sockaddr * restrict' but argument is of type 'struct sockaddr_in6 *' 391 | int getsockname (int, struct sockaddr *__restrict, socklen_t *__restrict); | ^ cc1: all warnings being treated as errors This compiled under glibc only because the argument is declared to be a "funky" transparent union which includes both types above. Explicitly cast the argument to allow compiling for both musl and glibc. Fixes: eed92afdd14c ("bpf: selftest: Test batching and bpf_(get|set)sockopt in bpf tcp iter") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Geliang Tang Link: https://lore.kernel.org/bpf/f41def0f17b27a23b1709080e4e3f37f4cc11ca9.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c index b52ff8ce34db..16bed9dd8e6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c @@ -95,7 +95,7 @@ static unsigned short get_local_port(int fd) struct sockaddr_in6 addr; socklen_t addrlen = sizeof(addr); - if (!getsockname(fd, &addr, &addrlen)) + if (!getsockname(fd, (struct sockaddr *)&addr, &addrlen)) return ntohs(addr.sin6_port); return 0; -- cgit v1.2.3 From 69f409469c9b1515a5db40d5a36fda372376fa2d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:31 -0700 Subject: selftests/bpf: Drop unneeded error.h includes The addition of general support for unprivileged tests in test_loader.c breaks building test_verifier on non-glibc (e.g. musl) systems, due to the inclusion of glibc extension '' in 'unpriv_helpers.c'. However, the header is actually not needed, so remove it to restore building. Similarly for sk_lookup.c and flow_dissector.c, error.h is not necessary and causes problems, so drop them. 
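If one of these files had actually used error(3), a portable replacement would be straightforward; the sketch below is hypothetical (error_compat() is not part of this patch) and relies only on ISO C and POSIX interfaces, so it builds against both glibc and musl:

/* Hypothetical stand-in for glibc's error(3), for sources that must
 * also build where <error.h> does not exist.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void error_compat(int status, int errnum, const char *fmt, ...)
{
	va_list ap;

	fflush(stdout);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	if (errnum)
		fprintf(stderr, ": %s", strerror(errnum));
	fputc('\n', stderr);
	if (status)
		exit(status);
}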
Fixes: 1d56ade032a4 ("selftests/bpf: Unprivileged tests for test_loader.c") Fixes: 0ab5539f8584 ("selftests/bpf: Tests for BPF_SK_LOOKUP attach point") Fixes: 0905beec9f52 ("selftests/bpf: run flow dissector tests in skb-less mode") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/5664367edf5fea4f3f4b4aec3b182bcfc6edff9c.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 1 - tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 1 - tools/testing/selftests/bpf/unpriv_helpers.c | 1 - 3 files changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9e5f38739104..9625e6d21791 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 96f3863da8bc..023c31bde229 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/testing/selftests/bpf/unpriv_helpers.c b/tools/testing/selftests/bpf/unpriv_helpers.c index b6d016461fb0..220f6a963813 100644 --- a/tools/testing/selftests/bpf/unpriv_helpers.c +++ b/tools/testing/selftests/bpf/unpriv_helpers.c @@ -2,7 +2,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From d44c93fc2f5a0c47b23fa03d374e45259abd92d2 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:34 -0700 Subject: selftests/bpf: Fix missing ARRAY_SIZE() definition in bench.c Add a "bpf_util.h" include to avoid the following error seen compiling for mips64el with musl libc: bench.c: In function 'find_benchmark': bench.c:590:25: error: implicit declaration of function 'ARRAY_SIZE' [-Werror=implicit-function-declaration] 590 | for (i = 0; i < ARRAY_SIZE(benchs); i++) { | ^~~~~~~~~~ cc1: all warnings being treated as errors Fixes: 8e7c2a023ac0 ("selftests/bpf: Add benchmark runner infrastructure") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/bc4dde77dfcd17a825d8f28f72f3292341966810.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/bench.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 627b74ae041b..90dc3aca32bd 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -10,6 +10,7 @@ #include #include #include "bench.h" +#include "bpf_util.h" #include "testing_helpers.h" struct env env = { -- cgit v1.2.3 From a2c155131b710959beb508ca6a54769b6b1bd488 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:35 -0700 Subject: selftests/bpf: Fix missing UINT_MAX definitions in benchmarks Include in 'bench.h' to provide a UINT_MAX definition and avoid multiple compile errors against mips64el/musl-libc like: benchs/bench_local_storage.c: In function 'parse_arg': benchs/bench_local_storage.c:40:38: error: 'UINT_MAX' undeclared (first use in this function) 40 | if (ret < 1 || ret > UINT_MAX) { | ^~~~~~~~ benchs/bench_local_storage.c:11:1: note: 'UINT_MAX' is defined in header 
''; did you forget to '#include '? 10 | #include +++ |+#include 11 | seen with bench_local_storage.c, bench_local_storage_rcu_tasks_trace.c, and bench_bpf_hashmap_lookup.c. Fixes: 73087489250d ("selftests/bpf: Add benchmark for local_storage get") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8f64a9d9fcff40a7fca090a65a68a9b62a468e16.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/bench.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 68180d8f8558..005c401b3e22 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -10,6 +10,7 @@ #include #include #include +#include struct cpu_set { bool *cpus; -- cgit v1.2.3 From 6495eb79ca7d15bd87c38d77307e8f9b6b7bf4ef Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:36 -0700 Subject: selftests/bpf: Fix missing BUILD_BUG_ON() declaration Explicitly include '' to fix errors seen compiling with gcc targeting mips64el/musl-libc: user_ringbuf.c: In function 'test_user_ringbuf_loop': user_ringbuf.c:426:9: error: implicit declaration of function 'BUILD_BUG_ON' [-Werror=implicit-function-declaration] 426 | BUILD_BUG_ON(total_samples <= c_max_entries); | ^~~~~~~~~~~~ cc1: all warnings being treated as errors Fixes: e5a9df51c746 ("selftests/bpf: Add selftests validating the user ringbuf") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b28575f9221ec54871c46a2e87612bb4bbf46ccd.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/user_ringbuf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index e51721df14fc..dfff6feac12c 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -4,6 +4,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include -- cgit v1.2.3 From 21f0b0af977203220ad58aff95e372151288ec47 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:37 -0700 Subject: selftests/bpf: Fix include of Update ns_current_pid_tgid.c to use '#include ' and avoid compile error against mips64el/musl libc: In file included from .../prog_tests/ns_current_pid_tgid.c:14: .../include/sys/fcntl.h:1:2: error: #warning redirecting incorrect #include to [-Werror=cpp] 1 | #warning redirecting incorrect #include to | ^~~~~~~ cc1: all warnings being treated as errors Fixes: 09c02d553c49 ("bpf, selftests: Fold test_current_pid_tgid_new_ns into test_progs.") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8bdc869749177b575025bf69600a4ce591822609.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index e72d75d6baa7..c29787e092d6 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include "network_helpers.h" #define STACK_SIZE (1024 * 1024) -- cgit v1.2.3 From 
4c329b99ef9c118343379bde9f97e8ce5cac9fc9 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:38 -0700 Subject: selftests/bpf: Fix compiling parse_tcp_hdr_opt.c with musl-libc The GNU version of 'struct tcphdr', with members 'doff' and 'urg_ptr', is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: parse_tcp_hdr_opt.c:18:21: error: 'struct tcphdr' has no member named 'urg_ptr' 18 | .pk6_v6.tcp.urg_ptr = 123, | ^~~~~~~ parse_tcp_hdr_opt.c:19:21: error: 'struct tcphdr' has no member named 'doff' 19 | .pk6_v6.tcp.doff = 9, /* 16 bytes of options */ | ^~~~ Fixes: cfa7b011894d ("selftests/bpf: tests for using dynptrs to parse skb and xdp buffers") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/ac5440213c242c62cb4e0d9e0a9cd5058b6a31f6.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c b/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c index daa952711d8f..e9c07d561ded 100644 --- a/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c +++ b/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include "test_parse_tcp_hdr_opt.skel.h" -- cgit v1.2.3 From bae9a5ce7d3a9b3a9e07b31ab9e9c58450e3e9fd Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:39 -0700 Subject: selftests/bpf: Fix compiling kfree_skb.c with musl-libc The GNU version of 'struct tcphdr' with member 'doff' is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: In file included from kfree_skb.c:2: kfree_skb.c: In function 'on_sample': kfree_skb.c:45:30: error: 'struct tcphdr' has no member named 'doff' 45 | if (CHECK(pkt_v6->tcp.doff != 5, "check_tcp", | ^ Fixes: 580d656d80cf ("selftests/bpf: Add kfree_skb raw_tp test") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e2d8cedc790959c10d6822a51f01a7a3616bea1b.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/kfree_skb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index c07991544a78..34f8822fd221 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include "kfree_skb.skel.h" -- cgit v1.2.3 From 5e4c43bcb85973243d7274e0058b6e8f5810e4f7 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:40 -0700 Subject: selftests/bpf: Fix compiling flow_dissector.c with musl-libc The GNU version of 'struct tcphdr' has members 'doff', 'source' and 'dest', which are not exposed by musl libc headers unless _GNU_SOURCE is defined. 
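For reference, a minimal sketch of the header contract (not code from this patch): musl's <netinet/tcp.h> always provides the BSD-style member names (th_off, th_sport, th_dport, th_urp) and only exposes the GNU-style names (doff, source, dest, urg_ptr) when _GNU_SOURCE is defined, so the define has to precede the first include:

/* Sketch: read the TCP data offset via the GNU-flavor member name */
#define _GNU_SOURCE
#include <netinet/tcp.h>

static unsigned int tcp_header_len(const struct tcphdr *th)
{
	return th->doff * 4;	/* 'doff' counts 32-bit words */
}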
Add this definition to fix errors seen compiling for mips64el/musl-libc: flow_dissector.c:118:30: error: 'struct tcphdr' has no member named 'doff' 118 | .tcp.doff = 5, | ^~~~ flow_dissector.c:119:30: error: 'struct tcphdr' has no member named 'source' 119 | .tcp.source = 80, | ^~~~~~ flow_dissector.c:120:30: error: 'struct tcphdr' has no member named 'dest' 120 | .tcp.dest = 8080, | ^~~~ Fixes: ae173a915785 ("selftests/bpf: support BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8f7ab21a73f678f9cebd32b26c444a686e57414d.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9625e6d21791..3171047414a7 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include -- cgit v1.2.3 From 18826fb0b79c3c3cd1fe765d85f9c6f1a902c722 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:41 -0700 Subject: selftests/bpf: Fix compiling tcp_rtt.c with musl-libc The GNU version of 'struct tcp_info' in 'netinet/tcp.h' is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: tcp_rtt.c: In function 'wait_for_ack': tcp_rtt.c:24:25: error: storage size of 'info' isn't known 24 | struct tcp_info info; | ^~~~ tcp_rtt.c:24:25: error: unused variable 'info' [-Werror=unused-variable] cc1: all warnings being treated as errors Fixes: 1f4f80fed217 ("selftests/bpf: test_progs: convert test_tcp_rtt") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/f2329767b15df206f08a5776d35a47c37da855ae.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index f2b99d95d916..c38784c1c066 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include "cgroup_helpers.h" #include "network_helpers.h" -- cgit v1.2.3 From debfa4f628f271f72933bf38d581cc53cfe1def5 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:42 -0700 Subject: selftests/bpf: Fix compiling core_reloc.c with musl-libc The type 'loff_t' is a GNU extension and not exposed by the musl 'fcntl.h' header unless _GNU_SOURCE is defined. 
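As a sketch (illustrative, not taken from the patch), the guard again has to come before the first header inclusion; on musl, 'fcntl.h' then typedefs loff_t as an alias of its always-64-bit off_t:

/* Sketch: making loff_t visible on both glibc and musl */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/types.h>

struct testmod_ctx_sketch {	/* hypothetical mirror of the bpf_testmod.h usage */
	loff_t off;
};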
Add this definition to fix errors seen compiling for mips64el/musl-libc: In file included from tools/testing/selftests/bpf/prog_tests/core_reloc.c:4: ./bpf_testmod/bpf_testmod.h:10:9: error: unknown type name 'loff_t' 10 | loff_t off; | ^~~~~~ ./bpf_testmod/bpf_testmod.h:16:9: error: unknown type name 'loff_t' 16 | loff_t off; | ^~~~~~ Fixes: 6bcd39d366b6 ("selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/11c3af75a7eb6bcb7ad9acfae6a6f470c572eb82.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 47f42e680105..26019313e1fc 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include "progs/core_reloc_types.h" #include "bpf_testmod/bpf_testmod.h" -- cgit v1.2.3 From 27c4797ce51c8dd51e35e68e9024a892f62d78b2 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:43 -0700 Subject: selftests/bpf: Fix errors compiling lwt_redirect.c with musl libc Remove a redundant include of '' which is already provided in 'lwt_helpers.h'. This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from lwt_redirect.c:51: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from .../linux/icmp.h:24, from lwt_redirect.c:50: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 43a7c3ef8a15 ("selftests/bpf: Add lwt_xmit tests for BPF_REDIRECT") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/3869dda876d5206d2f8d4dd67331c739ceb0c7f8.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/lwt_redirect.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c index 835a1d756c16..b6e8d822e8e9 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 1b00f355130a5dfc38a01ad02458ae2cb2ebe609 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:44 -0700 Subject: selftests/bpf: Fix errors compiling decap_sanity.c with musl libc Remove a redundant include of '', whose needed definitions are already provided by 'test_progs.h'. 
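The conflict underlying this group of fixes can be reduced to a few lines (a sketch, not code from any of the patches): the kernel UAPI and libc network headers each define struct in6_addr, struct sockaddr_in6 and struct ipv6_mreq; glibc coordinates with the kernel headers to suppress the duplicates, musl does not, so each translation unit must pick a single provider:

/* Sketch: use the libc header alone; adding <linux/in6.h> on top of it
 * redefines the IPv6 types on musl.
 */
#include <netinet/in.h>

static struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6 };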
This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from ./test_progs.h:17, from prog_tests/decap_sanity.c:9: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from decap_sanity.c:7: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 70a00e2f1dba ("selftests/bpf: Test bpf_skb_adjust_room on CHECKSUM_PARTIAL") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e986ba2d7edccd254b54f7cd049b98f10bafa8c3.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/decap_sanity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c index dcb9e5070cc3..d79f398ec6b7 100644 --- a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c +++ b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c @@ -4,7 +4,6 @@ #include #include #include -#include #include "test_progs.h" #include "network_helpers.h" -- cgit v1.2.3 From 9822be702fe6e1c3e0933ef4b68a8c56683d930d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:45 -0700 Subject: selftests/bpf: Fix errors compiling crypto_sanity.c with musl libc Remove a redundant include of '', whose needed definitions are already provided by 'test_progs.h'. 
This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from ./test_progs.h:17, from prog_tests/crypto_sanity.c:10: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from crypto_sanity.c:7: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 91541ab192fc ("selftests: bpf: crypto skcipher algo selftests") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/911293968f424ad7b462d8805aeb3baee8f4985b.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/crypto_sanity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c index b1a3a49a822a..42bd07f7218d 100644 --- a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c +++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include "test_progs.h" -- cgit v1.2.3 From 730561d3c08d4a327cceaabf11365958a1c00cec Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:46 -0700 Subject: selftests/bpf: Fix errors compiling cg_storage_multi.h with musl libc Remove a redundant include of '', whose needed definitions are already included (via '') in cg_storage_multi_egress_only.c, cg_storage_multi_isolated.c, and cg_storage_multi_shared.c. 
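Reduced to a sketch (illustrative, not from the patch): the mips64 asm/types.h route ends in int-l64.h, where __s64/__u64 are plain 'long', while <linux/types.h> resolves to int-ll64.h's 'long long' flavor, so a header shared by several programs should pull in <linux/types.h> alone:

/* Sketch: a single provider for the __u32/__s64 family keeps the
 * typedefs consistent in every unit that includes this header.
 */
#include <linux/types.h>

struct cgroup_value_sketch {	/* mirrors struct cgroup_value from the diff below */
	__u32 egress_pkts;
	__u32 ingress_pkts;
};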
This avoids redefinition errors seen compiling for mips64el/musl-libc like: In file included from progs/cg_storage_multi_egress_only.c:13: In file included from progs/cg_storage_multi.h:6: In file included from /usr/mips64el-linux-gnuabi64/include/asm/types.h:23: /usr/include/asm-generic/int-l64.h:29:25: error: typedef redefinition with different types ('long' vs 'long long') 29 | typedef __signed__ long __s64; | ^ /usr/include/asm-generic/int-ll64.h:30:44: note: previous definition is here 30 | __extension__ typedef __signed__ long long __s64; | ^ Fixes: 9e5bd1f7633b ("selftests/bpf: Test CGROUP_STORAGE map can't be used by multiple progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/4f4702e9f6115b7f84fea01b2326ca24c6df7ba8.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/progs/cg_storage_multi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi.h b/tools/testing/selftests/bpf/progs/cg_storage_multi.h index a0778fe7857a..41d59f0ee606 100644 --- a/tools/testing/selftests/bpf/progs/cg_storage_multi.h +++ b/tools/testing/selftests/bpf/progs/cg_storage_multi.h @@ -3,8 +3,6 @@ #ifndef __PROGS_CG_STORAGE_MULTI_H #define __PROGS_CG_STORAGE_MULTI_H -#include <asm/types.h> - struct cgroup_value { __u32 egress_pkts; __u32 ingress_pkts; -- cgit v1.2.3 From 04a94133f1b3cccb19e056c26f056c50b4e5b3b1 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 24 Jul 2024 12:14:58 -0500 Subject: libbpf: Don't take direct pointers into BTF data from st_ops In struct bpf_struct_ops, we take a pointer to a BTF type name and a struct btf_type. This was presumably done for convenience, but can actually result in subtle and confusing bugs given that BTF data can be invalidated before a program is loaded. For example, in sched_ext, we may sometimes resize a data section after a skeleton has been opened, but before the struct_ops scheduler map has been loaded. This may cause the BTF data to be realloc'd, which can then cause a UAF when loading the program because the struct_ops map has pointers directly into the BTF data. We're already storing the BTF type_id in struct bpf_struct_ops. Because type_id is stable, we can therefore just update the places where we were looking at those pointers to instead do the lookups we need from the type_id. Fixes: 590a00888250 ("bpf: libbpf: Add STRUCT_OPS support") Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240724171459.281234-1-void@manifault.com --- tools/lib/bpf/libbpf.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a3be6f8fac09..e55353887439 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -496,8 +496,6 @@ struct bpf_program { }; struct bpf_struct_ops { - const char *tname; - const struct btf_type *type; struct bpf_program **progs; __u32 *kern_func_off; /* e.g.
struct tcp_congestion_ops in bpf_prog's btf format */ @@ -1083,11 +1081,14 @@ static int bpf_object_adjust_struct_ops_autoload(struct bpf_object *obj) continue; for (j = 0; j < obj->nr_maps; ++j) { + const struct btf_type *type; + map = &obj->maps[j]; if (!bpf_map__is_struct_ops(map)) continue; - vlen = btf_vlen(map->st_ops->type); + type = btf__type_by_id(obj->btf, map->st_ops->type_id); + vlen = btf_vlen(type); for (k = 0; k < vlen; ++k) { slot_prog = map->st_ops->progs[k]; if (prog != slot_prog) @@ -1121,8 +1122,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) int err; st_ops = map->st_ops; - type = st_ops->type; - tname = st_ops->tname; + type = btf__type_by_id(btf, st_ops->type_id); + tname = btf__name_by_offset(btf, type->name_off); err = find_struct_ops_kern_types(obj, tname, &mod_btf, &kern_type, &kern_type_id, &kern_vtype, &kern_vtype_id, @@ -1423,8 +1424,6 @@ static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name, memcpy(st_ops->data, data->d_buf + vsi->offset, type->size); - st_ops->tname = tname; - st_ops->type = type; st_ops->type_id = type_id; pr_debug("struct_ops init: struct %s(type_id=%u) %s found at offset %u\n", @@ -8445,11 +8444,13 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, static void bpf_map_prepare_vdata(const struct bpf_map *map) { + const struct btf_type *type; struct bpf_struct_ops *st_ops; __u32 i; st_ops = map->st_ops; - for (i = 0; i < btf_vlen(st_ops->type); i++) { + type = btf__type_by_id(map->obj->btf, st_ops->type_id); + for (i = 0; i < btf_vlen(type); i++) { struct bpf_program *prog = st_ops->progs[i]; void *kern_data; int prog_fd; @@ -9712,6 +9713,7 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Data *data) { + const struct btf_type *type; const struct btf_member *member; struct bpf_struct_ops *st_ops; struct bpf_program *prog; @@ -9771,13 +9773,14 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, } insn_idx = sym->st_value / BPF_INSN_SZ; - member = find_member_by_offset(st_ops->type, moff * 8); + type = btf__type_by_id(btf, st_ops->type_id); + member = find_member_by_offset(type, moff * 8); if (!member) { pr_warn("struct_ops reloc %s: cannot find member at moff %u\n", map->name, moff); return -EINVAL; } - member_idx = member - btf_members(st_ops->type); + member_idx = member - btf_members(type); name = btf__name_by_offset(btf, member->name_off); if (!resolve_func_ptr(btf, member->type, NULL)) { -- cgit v1.2.3 From 0bfdda9db88990539b63a11028c84c4c069e8a0f Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Sun, 21 Jul 2024 21:33:03 +0200 Subject: selftests/bpf: Update xdp_redirect_map prog sections for libbpf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xdp_redirect_map.c is a bpf program used by test_xdp_veth.sh, which is not handled by the generic test runner (test_progs). To allow converting this test to test_progs, the corresponding program must be updated to allow handling it through skeletons generated by bpftool and libbpf. Update programs section names to allow to manipulate those with libbpf. 
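As a usage sketch (assuming the skeleton generated from xdp_redirect_map.bpf.o; error handling trimmed), the uniform SEC("xdp") annotation lets libbpf infer the program type, while the individual programs are reached through their C identifiers rather than bespoke section names:

/* Sketch: select one of several SEC("xdp") programs via the skeleton */
#include "xdp_redirect_map.skel.h"

static int attach_redirect_prog0(int ifindex)
{
	struct xdp_redirect_map *skel;
	struct bpf_link *link;

	skel = xdp_redirect_map__open_and_load();
	if (!skel)
		return -1;
	link = bpf_program__attach_xdp(skel->progs.xdp_redirect_map_0, ifindex);
	if (!link) {
		xdp_redirect_map__destroy(skel);
		return -1;
	}
	return 0;	/* skel and link stay alive while attached */
}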
Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240721-convert_test_xdp_veth-v4-1-23bdba21b2f9@bootlin.com Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/xdp_redirect_map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c index d037262c8937..682dda8dabbc 100644 --- a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c +++ b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c @@ -10,19 +10,19 @@ struct { __uint(value_size, sizeof(int)); } tx_port SEC(".maps"); -SEC("redirect_map_0") +SEC("xdp") int xdp_redirect_map_0(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 0, 0); } -SEC("redirect_map_1") +SEC("xdp") int xdp_redirect_map_1(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 1, 0); } -SEC("redirect_map_2") +SEC("xdp") int xdp_redirect_map_2(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 2, 0); -- cgit v1.2.3 From 41b01a0271fd0387171eb9ad4692c22e37c8c80a Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Sun, 21 Jul 2024 21:33:04 +0200 Subject: selftests/bpf: Integrate test_xdp_veth into test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_xdp_veth.sh tests that XDP return codes work as expected, by bringing up multiple veth pairs isolated in different namespaces, attaching specific xdp programs to each interface, and ensuring that the whole chain allows pinging one end interface from the first one. The test runs well but is currently not integrated in test_progs, which prevents it from being run automatically in the CI infrastructure. Rewrite it as a C test relying on libbpf to allow running it in the CI infrastructure. The new code brings up the same network infrastructure and reuses the same eBPF programs as test_xdp_veth.sh, for which skeletons are already generated by the bpf tests makefile.
Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240721-convert_test_xdp_veth-v4-2-23bdba21b2f9@bootlin.com Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/Makefile | 1 - .../selftests/bpf/prog_tests/test_xdp_veth.c | 213 +++++++++++++++++++++ tools/testing/selftests/bpf/test_xdp_veth.sh | 121 ------------ 3 files changed, 213 insertions(+), 122 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c delete mode 100755 tools/testing/selftests/bpf/test_xdp_veth.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 083f413a9c98..774c6270e377 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -115,7 +115,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ - test_xdp_veth.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c new file mode 100644 index 000000000000..8d75424fe6bc --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Create 3 namespaces with 3 veth peers, and forward packets in-between using + * native XDP + * + * XDP_TX + * NS1(veth11) NS2(veth22) NS3(veth33) + * | | | + * | | | + * (veth1, (veth2, (veth3, + * id:111) id:122) id:133) + * ^ | ^ | ^ | + * | | XDP_REDIRECT | | XDP_REDIRECT | | + * | ------------------ ------------------ | + * ----------------------------------------- + * XDP_REDIRECT + */ + +#define _GNU_SOURCE +#include +#include "test_progs.h" +#include "network_helpers.h" +#include "xdp_dummy.skel.h" +#include "xdp_redirect_map.skel.h" +#include "xdp_tx.skel.h" + +#define VETH_PAIRS_COUNT 3 +#define NS_SUFFIX_LEN 6 +#define VETH_NAME_MAX_LEN 16 +#define IP_SRC "10.1.1.11" +#define IP_DST "10.1.1.33" +#define IP_CMD_MAX_LEN 128 + +struct skeletons { + struct xdp_dummy *xdp_dummy; + struct xdp_tx *xdp_tx; + struct xdp_redirect_map *xdp_redirect_maps; +}; + +struct veth_configuration { + char local_veth[VETH_NAME_MAX_LEN]; /* Interface in main namespace */ + char remote_veth[VETH_NAME_MAX_LEN]; /* Peer interface in dedicated namespace*/ + const char *namespace; /* Namespace for the remote veth */ + char next_veth[VETH_NAME_MAX_LEN]; /* Local interface to redirect traffic to */ + char *remote_addr; /* IP address of the remote veth */ +}; + +static struct veth_configuration config[VETH_PAIRS_COUNT] = { + { + .local_veth = "veth1", + .remote_veth = "veth11", + .next_veth = "veth2", + .remote_addr = IP_SRC, + .namespace = "ns-veth11" + }, + { + .local_veth = "veth2", + .remote_veth = "veth22", + .next_veth = "veth3", + .remote_addr = NULL, + .namespace = "ns-veth22" + }, + { + .local_veth = "veth3", + .remote_veth = "veth33", + .next_veth = "veth1", + .remote_addr = IP_DST, + .namespace = "ns-veth33" + } +}; + +static int attach_programs_to_veth_pair(struct skeletons *skeletons, int index) +{ + struct bpf_program *local_prog, *remote_prog; + struct bpf_link **local_link, **remote_link; + struct nstoken *nstoken; + struct bpf_link *link; + int interface; + + switch (index) { + case 0: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_0; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_0; + remote_prog = 
skeletons->xdp_dummy->progs.xdp_dummy_prog; + remote_link = &skeletons->xdp_dummy->links.xdp_dummy_prog; + break; + case 1: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_1; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_1; + remote_prog = skeletons->xdp_tx->progs.xdp_tx; + remote_link = &skeletons->xdp_tx->links.xdp_tx; + break; + case 2: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_2; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_2; + remote_prog = skeletons->xdp_dummy->progs.xdp_dummy_prog; + remote_link = &skeletons->xdp_dummy->links.xdp_dummy_prog; + break; + } + interface = if_nametoindex(config[index].local_veth); + if (!ASSERT_NEQ(interface, 0, "non zero interface index")) + return -1; + link = bpf_program__attach_xdp(local_prog, interface); + if (!ASSERT_OK_PTR(link, "attach xdp program to local veth")) + return -1; + *local_link = link; + nstoken = open_netns(config[index].namespace); + if (!ASSERT_OK_PTR(nstoken, "switch to remote veth namespace")) + return -1; + interface = if_nametoindex(config[index].remote_veth); + if (!ASSERT_NEQ(interface, 0, "non zero interface index")) { + close_netns(nstoken); + return -1; + } + link = bpf_program__attach_xdp(remote_prog, interface); + *remote_link = link; + close_netns(nstoken); + if (!ASSERT_OK_PTR(link, "attach xdp program to remote veth")) + return -1; + + return 0; +} + +static int configure_network(struct skeletons *skeletons) +{ + int interface_id; + int map_fd; + int err; + int i = 0; + + /* First create and configure all interfaces */ + for (i = 0; i < VETH_PAIRS_COUNT; i++) { + SYS(fail, "ip netns add %s", config[i].namespace); + SYS(fail, "ip link add %s type veth peer name %s netns %s", + config[i].local_veth, config[i].remote_veth, config[i].namespace); + SYS(fail, "ip link set dev %s up", config[i].local_veth); + if (config[i].remote_addr) + SYS(fail, "ip -n %s addr add %s/24 dev %s", config[i].namespace, + config[i].remote_addr, config[i].remote_veth); + SYS(fail, "ip -n %s link set dev %s up", config[i].namespace, + config[i].remote_veth); + } + + /* Then configure the redirect map and attach programs to interfaces */ + map_fd = bpf_map__fd(skeletons->xdp_redirect_maps->maps.tx_port); + if (!ASSERT_GE(map_fd, 0, "open redirect map")) + goto fail; + for (i = 0; i < VETH_PAIRS_COUNT; i++) { + interface_id = if_nametoindex(config[i].next_veth); + if (!ASSERT_NEQ(interface_id, 0, "non zero interface index")) + goto fail; + err = bpf_map_update_elem(map_fd, &i, &interface_id, BPF_ANY); + if (!ASSERT_OK(err, "configure interface redirection through map")) + goto fail; + if (attach_programs_to_veth_pair(skeletons, i)) + goto fail; + } + + return 0; + +fail: + return -1; +} + +static void cleanup_network(void) +{ + int i; + + /* Deleting namespaces is enough to automatically remove veth pairs as well + */ + for (i = 0; i < VETH_PAIRS_COUNT; i++) + SYS_NOFAIL("ip netns del %s", config[i].namespace); +} + +static int check_ping(struct skeletons *skeletons) +{ + /* Test: if all interfaces are properly configured, we must be able to ping + * veth33 from veth11 + */ + return SYS_NOFAIL("ip netns exec %s ping -c 1 -W 1 %s > /dev/null", + config[0].namespace, IP_DST); +} + +void test_xdp_veth_redirect(void) +{ + struct skeletons skeletons = {}; + + skeletons.xdp_dummy = xdp_dummy__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_dummy, "xdp_dummy__open_and_load")) + return; + + skeletons.xdp_tx = xdp_tx__open_and_load(); + if 
(!ASSERT_OK_PTR(skeletons.xdp_tx, "xdp_tx__open_and_load")) + goto destroy_xdp_dummy; + + skeletons.xdp_redirect_maps = xdp_redirect_map__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_redirect_maps, "xdp_redirect_map__open_and_load")) + goto destroy_xdp_tx; + + if (configure_network(&skeletons)) + goto destroy_xdp_redirect_map; + + ASSERT_OK(check_ping(&skeletons), "ping"); + +destroy_xdp_redirect_map: + xdp_redirect_map__destroy(skeletons.xdp_redirect_maps); +destroy_xdp_tx: + xdp_tx__destroy(skeletons.xdp_tx); +destroy_xdp_dummy: + xdp_dummy__destroy(skeletons.xdp_dummy); + + cleanup_network(); +} diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh deleted file mode 100755 index 5211ca9a0239..000000000000 --- a/tools/testing/selftests/bpf/test_xdp_veth.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Create 3 namespaces with 3 veth peers, and -# forward packets in-between using native XDP -# -# XDP_TX -# NS1(veth11) NS2(veth22) NS3(veth33) -# | | | -# | | | -# (veth1, (veth2, (veth3, -# id:111) id:122) id:133) -# ^ | ^ | ^ | -# | | XDP_REDIRECT | | XDP_REDIRECT | | -# | ------------------ ------------------ | -# ----------------------------------------- -# XDP_REDIRECT - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -TESTNAME=xdp_veth -BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_DIR=$BPF_FS/test_$TESTNAME -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" -readonly NS3="ns3-$(mktemp -u XXXXXX)" - -_cleanup() -{ - set +e - ip link del veth1 2> /dev/null - ip link del veth2 2> /dev/null - ip link del veth3 2> /dev/null - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null - ip netns del ${NS3} 2> /dev/null - rm -rf $BPF_DIR 2> /dev/null -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -if [ $(id -u) -ne 0 ]; then - echo "selftests: $TESTNAME [SKIP] Need root privileges" - exit $ksft_skip -fi - -if ! ip link set dev lo xdp off > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without the ip xdp support" - exit $ksft_skip -fi - -if [ -z "$BPF_FS" ]; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" - exit $ksft_skip -fi - -if ! 
bpftool version > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" - exit $ksft_skip -fi - -set -e - -trap cleanup_skip EXIT - -ip netns add ${NS1} -ip netns add ${NS2} -ip netns add ${NS3} - -ip link add veth1 index 111 type veth peer name veth11 netns ${NS1} -ip link add veth2 index 122 type veth peer name veth22 netns ${NS2} -ip link add veth3 index 133 type veth peer name veth33 netns ${NS3} - -ip link set veth1 up -ip link set veth2 up -ip link set veth3 up - -ip -n ${NS1} addr add 10.1.1.11/24 dev veth11 -ip -n ${NS3} addr add 10.1.1.33/24 dev veth33 - -ip -n ${NS1} link set dev veth11 up -ip -n ${NS2} link set dev veth22 up -ip -n ${NS3} link set dev veth33 up - -mkdir $BPF_DIR -bpftool prog loadall \ - xdp_redirect_map.bpf.o $BPF_DIR/progs type xdp \ - pinmaps $BPF_DIR/maps -bpftool map update pinned $BPF_DIR/maps/tx_port key 0 0 0 0 value 122 0 0 0 -bpftool map update pinned $BPF_DIR/maps/tx_port key 1 0 0 0 value 133 0 0 0 -bpftool map update pinned $BPF_DIR/maps/tx_port key 2 0 0 0 value 111 0 0 0 -ip link set dev veth1 xdp pinned $BPF_DIR/progs/xdp_redirect_map_0 -ip link set dev veth2 xdp pinned $BPF_DIR/progs/xdp_redirect_map_1 -ip link set dev veth3 xdp pinned $BPF_DIR/progs/xdp_redirect_map_2 - -ip -n ${NS1} link set dev veth11 xdp obj xdp_dummy.bpf.o sec xdp -ip -n ${NS2} link set dev veth22 xdp obj xdp_tx.bpf.o sec xdp -ip -n ${NS3} link set dev veth33 xdp obj xdp_dummy.bpf.o sec xdp - -trap cleanup EXIT - -ip netns exec ${NS1} ping -c 1 -W 1 10.1.1.33 - -exit 0 -- cgit v1.2.3 From ba71ffb660e4d41fe589f2459fb888ea61fdb310 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 24 Jul 2024 22:22:14 -0500 Subject: selftests/bpf: Load struct_ops map in global_maps_resize test In prog_tests/test_global_maps_resize.c, we test various use cases for resizing global maps. Commit 7244100e0389 ("libbpf: Don't take direct pointers into BTF data from st_ops") updated libbpf to not store pointers to volatile BTF data, which for some users, was causing a UAF when resizing a datasec array. Let's ensure we have coverage for resizing datasec arrays with struct_ops progs by also including a struct_ops map and struct_ops prog in the test_global_map_resize skeleton. The map is automatically loaded, so we don't need to do anything other than add it to the BPF prog being tested to get the coverage. 
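[Editor's sketch] For context, a minimal userspace sketch of the datasec-resize pattern this test exercises; the helper name and the ".bss" lookup are illustrative assumptions, not taken from the patch (libbpf internally prefixes internal map names with a truncated object name, e.g. "test_glo.bss"):

```
#include <errno.h>
#include <bpf/libbpf.h>

static int resize_bss_array(struct bpf_object *obj, size_t new_sz)
{
	/* global arrays in .bss/.data are backed by an internal map whose
	 * value size (and backing BTF datasec) libbpf can adjust */
	struct bpf_map *bss = bpf_object__find_map_by_name(obj, ".bss");

	if (!bss)
		return -ENOENT;
	/* only valid after bpf_object__open*() and before bpf_object__load() */
	return bpf_map__set_value_size(bss, new_sz);
}
```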
Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240725032214.50676-1-void@manifault.com --- .../testing/selftests/bpf/progs/test_global_map_resize.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c index 1fbb73d3e5d5..714b29c7f8b2 100644 --- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c +++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c @@ -3,6 +3,7 @@ #include "vmlinux.h" #include +#include char _license[] SEC("license") = "GPL"; @@ -60,3 +61,18 @@ int data_array_sum(void *ctx) return 0; } + +SEC("struct_ops/test_1") +int BPF_PROG(test_1) +{ + return 0; +} + +struct bpf_testmod_ops { + int (*test_1)(void); +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops st_ops_resize = { + .test_1 = (void *)test_1 +}; -- cgit v1.2.3 From 781f0bbbdade00f91490a3f64212e8cc8d75905c Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 24 Jul 2024 04:11:20 -0700 Subject: tools/bpf: Fix the wrong format specifier The format specifier of "unsigned int" in printf() should be "%u", not "%d". Signed-off-by: Zhu Jun Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240724111120.11625-1-zhujun2@cmss.chinamobile.com --- tools/bpf/bpftool/xlated_dumper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 567f56dfd9f1..d0094345fb2b 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -349,7 +349,7 @@ void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len, double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); - printf("% 4d: ", i); + printf("%4u: ", i); print_bpf_insn(&cbs, insn + i, true); if (opcodes) { @@ -415,7 +415,7 @@ void dump_xlated_for_graph(struct dump_data *dd, void *buf_start, void *buf_end, } } - printf("%d: ", insn_off); + printf("%u: ", insn_off); print_bpf_insn(&cbs, cur, true); if (opcodes) { -- cgit v1.2.3 From c0247800ee7da5bc067b2916cf2722f755072307 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:17 -0700 Subject: selftests/bpf: Use portable POSIX basename() Use the POSIX version of basename() to allow compilation against non-gnu libc (e.g. musl). Include ahead of to enable using functions from the latter while preferring POSIX over GNU basename(). In veristat.c, rely on strdupa() to avoid basename() altering the passed "const char" argument. This is not needed in xskxceiver.c since the arg is mutable and the program exits immediately after usage. Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/0fd3c9f3c605e6cba33504213c9df287817ade04.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/veristat.c | 8 +++++--- tools/testing/selftests/bpf/xskxceiver.c | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index b2854238d4a0..11ec1190d582 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -2,6 +2,7 @@ /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ #define _GNU_SOURCE #include +#include #include #include #include @@ -988,8 +989,8 @@ skip_freplace_fixup: static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) { + const char *base_filename = basename(strdupa(filename)); const char *prog_name = bpf_program__name(prog); - const char *base_filename = basename(filename); char *buf; int buf_sz, log_level; struct verif_stats *stats; @@ -1056,13 +1057,14 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf static int process_obj(const char *filename) { + const char *base_filename = basename(strdupa(filename)); struct bpf_object *obj = NULL, *tobj; struct bpf_program *prog, *tprog, *lprog; libbpf_print_fn_t old_libbpf_print_fn; LIBBPF_OPTS(bpf_object_open_opts, opts); int err = 0, prog_cnt = 0; - if (!should_process_file_prog(basename(filename), NULL)) { + if (!should_process_file_prog(base_filename, NULL)) { if (env.verbose) printf("Skipping '%s' due to filters...\n", filename); env.files_skipped++; @@ -1076,7 +1078,7 @@ static int process_obj(const char *filename) } if (!env.quiet && env.out_fmt == RESFMT_TABLE) - printf("Processing '%s'...\n", basename(filename)); + printf("Processing '%s'...\n", base_filename); old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn); obj = bpf_object__open_file(filename, &opts); diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 8144fd145237..92af633faea8 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 03bfcda1fbc37ef34aa21d2b9e09138335afc6ee Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:18 -0700 Subject: selftests/bpf: Fix arg parsing in veristat, test_progs Current code parses arguments with strtok_r() using a construct like char *state = NULL; while ((next = strtok_r(state ? NULL : input, ",", &state))) { ... } where logic assumes the 'state' var can distinguish between first and subsequent strtok_r() calls, and adjusts parameters accordingly. However, 'state' is strictly internal context for strtok_r() and no such assumptions are supported in the man page. Moreover, the exact behaviour of 'state' depends on the libc implementation, making the above code fragile. Indeed, invoking "./test_progs -t " on mips64el/musl will hang, with the above code in an infinite loop. Similarly, we see strange behaviour running 'veristat' on mips64el/musl: $ ./veristat -e file,prog,verdict,insns -C two-ok add-failure Can't specify more than 9 stats Rewrite code using a counter to distinguish between strtok_r() calls. 
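[Editor's sketch] A minimal standalone version of the portable loop (the function name is illustrative):

```
#include <stdio.h>
#include <string.h>

static void parse_list(char *input)
{
	char *state = NULL, *tok;
	int cnt = 0;

	/* select the first-call argument with an explicit counter; the
	 * 'state' pointer is strictly strtok_r()-internal context */
	while ((tok = strtok_r(cnt++ ? NULL : input, ",", &state)))
		printf("token: %s\n", tok);
}
```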
Fixes: 61ddff373ffa ("selftests/bpf: Improve by-name subtest selection logic in prog_tests") Fixes: 394169b079b5 ("selftests/bpf: add comparison mode to veristat") Fixes: c8bc5e050976 ("selftests/bpf: Add veristat tool for mass-verifying BPF object files") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/392d8bf5559f85fa37926c1494e62312ef252c3d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/testing_helpers.c | 4 ++-- tools/testing/selftests/bpf/veristat.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index ac7c66f4fc7b..c217e12bd9da 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -221,13 +221,13 @@ int parse_test_list(const char *s, bool is_glob_pattern) { char *input, *state = NULL, *test_spec; - int err = 0; + int err = 0, cnt = 0; input = strdup(s); if (!input) return -ENOMEM; - while ((test_spec = strtok_r(state ? NULL : input, ",", &state))) { + while ((test_spec = strtok_r(cnt++ ? NULL : input, ",", &state))) { err = insert_test(set, test_spec, is_glob_pattern); if (err) break; diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 11ec1190d582..1ec5c4c47235 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -785,13 +785,13 @@ static int parse_stat(const char *stat_name, struct stat_specs *specs) static int parse_stats(const char *stats_str, struct stat_specs *specs) { char *input, *state = NULL, *next; - int err; + int err, cnt = 0; input = strdup(stats_str); if (!input) return -ENOMEM; - while ((next = strtok_r(state ? NULL : input, ",", &state))) { + while ((next = strtok_r(cnt++ ? NULL : input, ",", &state))) { err = parse_stat(next, specs); if (err) { free(input); @@ -1495,7 +1495,7 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs, while (fgets(line, sizeof(line), f)) { char *input = line, *state = NULL, *next; struct verif_stats *st = NULL; - int col = 0; + int col = 0, cnt = 0; if (!header) { void *tmp; @@ -1513,7 +1513,7 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs, *stat_cntp += 1; } - while ((next = strtok_r(state ? NULL : input, ",\n", &state))) { + while ((next = strtok_r(cnt++ ? NULL : input, ",\n", &state))) { if (header) { /* for the first line, set up spec stats */ err = parse_stat(next, specs); -- cgit v1.2.3 From cacf2a5a78cd1f5f616eae043ebc6f024104b721 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:19 -0700 Subject: selftests/bpf: Fix error compiling test_lru_map.c Although the post-increment in macro 'CPU_SET(next++, &cpuset)' seems safe, the sequencing can raise compile errors, so move the increment outside the macro. 
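[Editor's sketch] The hoisted form, as a standalone fragment (the helper name is illustrative):

```
#define _GNU_SOURCE
#include <sched.h>

static void pin_next_cpu(cpu_set_t *cpuset, int *next)
{
	CPU_ZERO(cpuset);
	CPU_SET(*next, cpuset);	/* macro argument has no side effects */
	(*next)++;		/* increment sequenced outside the macro */
}
```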
This avoids an error seen using gcc 12.3.0 for mips64el/musl-libc: In file included from test_lru_map.c:11: test_lru_map.c: In function 'sched_next_online': test_lru_map.c:129:29: error: operation on 'next' may be undefined [-Werror=sequence-point] 129 | CPU_SET(next++, &cpuset); | ^ cc1: all warnings being treated as errors Fixes: 3fbfadce6012 ("bpf: Fix test_lru_sanity5() in test_lru_map.c") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/22993dfb11ccf27925a626b32672fd3324cb76c4.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_lru_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index 4d0650cfb5cd..fda7589c5023 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -126,7 +126,8 @@ static int sched_next_online(int pid, int *next_to_try) while (next < nr_cpus) { CPU_ZERO(&cpuset); - CPU_SET(next++, &cpuset); + CPU_SET(next, &cpuset); + next++; if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset)) { ret = 0; break; -- cgit v1.2.3 From aa95073fd290b5b3e45f067fa22bb25e59e1ff7c Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:20 -0700 Subject: selftests/bpf: Fix C++ compile error from missing _Bool type While building, bpftool makes a skeleton from test_core_extern.c, which itself includes and uses the 'bool' type. However, the skeleton test_core_extern.skel.h generated *does not* include or use the 'bool' type, instead using the C-only '_Bool' type. Compiling test_cpp.cpp with g++ 12.3 for mips64el/musl-libc then fails with error: In file included from test_cpp.cpp:9: test_core_extern.skel.h:45:17: error: '_Bool' does not name a type 45 | _Bool CONFIG_BOOL; | ^~~~~ This was likely missed previously because glibc uses a GNU extension for with C++ (#define _Bool bool), not supported by musl libc. Normally, a C fragment would include and use the 'bool' type, and thus cleanly work after import by C++. The ideal fix would be for 'bpftool gen skeleton' to output the correct type/include supporting C++, but in the meantime add a conditional define as above. 
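[Editor's note] For reference, the glibc GNU extension the message cites amounts to roughly this fragment of its C++ handling (paraphrased, not an exact copy of the glibc source):

```
/* glibc <stdbool.h>, C++ mode (paraphrase): map the C keyword to the
 * C++ built-in type; musl has no equivalent, hence the fallback guard */
#ifdef __cplusplus
#define _Bool bool
#endif
```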
Fixes: 7c8dce4b1661 ("bpftool: Make skeleton C code compilable with C++ compiler") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/6fc1dd28b8bda49e51e4f610bdc9d22f4455632d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_cpp.cpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index dde0bb16e782..abc2a56ab261 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -6,6 +6,10 @@ #include #include #include + +#ifndef _Bool +#define _Bool bool +#endif #include "test_core_extern.skel.h" #include "struct_ops_module.skel.h" -- cgit v1.2.3 From 16b795cc59528cf280abc79af3c70bda42f715b9 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:21 -0700 Subject: selftests/bpf: Fix redefinition errors compiling lwt_reroute.c Compiling lwt_reroute.c with GCC 12.3 for mips64el/musl-libc yields errors: In file included from .../include/arpa/inet.h:9, from ./test_progs.h:18, from tools/testing/selftests/bpf/prog_tests/lwt_helpers.h:11, from tools/testing/selftests/bpf/prog_tests/lwt_reroute.c:52: .../include/netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from .../include/linux/icmp.h:24, from tools/testing/selftests/bpf/prog_tests/lwt_helpers.h:9: .../include/linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../include/netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../include/linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../include/netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../include/linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ These errors occur because is included before , bypassing the Linux uapi/libc compat mechanism's partial musl support. As described in [1] and [2], fix these errors by including in lwt_reroute.c before any uapi headers. [1]: commit c0bace798436 ("uapi libc compat: add fallback for unsupported libcs") [2]: https://git.musl-libc.org/cgit/musl/commit/?id=04983f227238 Fixes: 6c77997bc639 ("selftests/bpf: Add lwt_xmit tests for BPF_REROUTE") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/bd2908aec0755ba8b75f5dc41848b00585f5c73e.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/lwt_reroute.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c index 03825d2b45a8..6c50c0f63f43 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c @@ -49,6 +49,7 @@ * is not crashed, it is considered successful. */ #define NETNS "ns_lwt_reroute" +#include #include "lwt_helpers.h" #include "network_helpers.h" #include -- cgit v1.2.3 From c9a83e76b5a96801a2c7ea0a79ca77c356d8b38d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:22 -0700 Subject: selftests/bpf: Fix compile if backtrace support missing in libc Include GNU header only with glibc and provide weak, stubbed backtrace functions as a fallback in test_progs.c. 
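[Editor's sketch] The fallback pattern as a standalone fragment; the "<backtrace unavailable>" message is illustrative (the patch's stub just prints a newline), and the stubs are intended to resolve only when libc lacks the real symbols:

```
#define _GNU_SOURCE
#include <stdio.h>

#define __weak __attribute__((weak))

/* weak fallbacks for libcs without <execinfo.h>, e.g. musl */
__weak int backtrace(void **buffer, int size)
{
	return 0;	/* no frames captured */
}

__weak void backtrace_symbols_fd(void *const *buffer, int size, int fd)
{
	dprintf(fd, "<backtrace unavailable>\n");
}
```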
This allows for non-GNU replacements while avoiding compile errors (e.g. with musl libc) like: test_progs.c:13:10: fatal error: execinfo.h: No such file or directory 13 | #include /* backtrace */ | ^~~~~~~~~~~~ test_progs.c: In function 'crash_handler': test_progs.c:1034:14: error: implicit declaration of function 'backtrace' [-Werror=implicit-function-declaration] 1034 | sz = backtrace(bt, ARRAY_SIZE(bt)); | ^~~~~~~~~ test_progs.c:1045:9: error: implicit declaration of function 'backtrace_symbols_fd' [-Werror=implicit-function-declaration] 1045 | backtrace_symbols_fd(bt, sz, STDERR_FILENO); | ^~~~~~~~~~~~~~~~~~~~ Fixes: 9fb156bb82a3 ("selftests/bpf: Print backtrace on SIGSEGV in test_progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/aa6dc8e23710cb457b278039d0081de7e7b4847d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 60c5ec0f6abf..d5d0cb4eb197 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -10,7 +10,6 @@ #include #include #include -#include /* backtrace */ #include /* get_nprocs */ #include #include @@ -19,6 +18,21 @@ #include #include "json_writer.h" +#ifdef __GLIBC__ +#include /* backtrace */ +#endif + +/* Default backtrace funcs if missing at link */ +__weak int backtrace(void **buffer, int size) +{ + return 0; +} + +__weak void backtrace_symbols_fd(void *const *buffer, int size, int fd) +{ + dprintf(fd, "\n"); +} + static bool verbose(void) { return env.verbosity > VERBOSE_NONE; -- cgit v1.2.3 From 06eeca1217a8655975f340e0ead6a396df74a2a4 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:23 -0700 Subject: selftests/bpf: Fix using stdout, stderr as struct field names Typically stdin, stdout, stderr are treated as reserved identifiers under ISO/ANSI C and libc implementations further define these as macros, both in glibc and musl . However, while glibc defines: ... /* Standard streams. */ extern FILE *stdin; /* Standard input stream. */ extern FILE *stdout; /* Standard output stream. */ extern FILE *stderr; /* Standard error output stream. */ /* C89/C99 say they're macros. Make them happy. */ #define stdin stdin #define stdout stdout #define stderr stderr ... musl instead uses (legally): ... extern FILE *const stdin; extern FILE *const stdout; extern FILE *const stderr; #define stdin (stdin) #define stdout (stdout) #define stderr (stderr) ... The latter results in compile errors when the names are reused as fields of 'struct test_env' and elsewhere in test_progs.[ch] and reg_bounds.c. 
Rename the fields to stdout_saved and stderr_saved to avoid many errors seen building against musl, e.g.: In file included from test_progs.h:6, from test_progs.c:5: test_progs.c: In function 'print_test_result': test_progs.c:237:21: error: expected identifier before '(' token 237 | fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); | ^~~~~~ test_progs.c:237:9: error: too few arguments to function 'fprintf' 237 | fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); | ^~~~~~~ Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/ZqR2DuHdBXPX%2Fyx8@kodidev-ubuntu/ Link: https://lore.kernel.org/bpf/684ea17548e237f39dfb3f7a3d33450069015b21.1722244708.git.tony.ambardar@gmail.com --- .../testing/selftests/bpf/prog_tests/reg_bounds.c | 2 +- tools/testing/selftests/bpf/test_progs.c | 66 +++++++++++----------- tools/testing/selftests/bpf/test_progs.h | 8 +-- 3 files changed, 38 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 0da4225749bd..467027236d30 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -1487,7 +1487,7 @@ static int verify_case_opt(struct ctx *ctx, enum num_t init_t, enum num_t cond_t u64 elapsed_ns = get_time_ns() - ctx->start_ns; double remain_ns = elapsed_ns / progress * (1 - progress); - fprintf(env.stderr, "PROGRESS (%s): %d/%d (%.2lf%%), " + fprintf(env.stderr_saved, "PROGRESS (%s): %d/%d (%.2lf%%), " "elapsed %llu mins (%.2lf hrs), " "ETA %.0lf mins (%.2lf hrs)\n", ctx->progress_ctx, diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index d5d0cb4eb197..60fafa2f1ed7 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -51,15 +51,15 @@ static void stdio_hijack_init(char **log_buf, size_t *log_cnt) stdout = open_memstream(log_buf, log_cnt); if (!stdout) { - stdout = env.stdout; + stdout = env.stdout_saved; perror("open_memstream"); return; } if (env.subtest_state) - env.subtest_state->stdout = stdout; + env.subtest_state->stdout_saved = stdout; else - env.test_state->stdout = stdout; + env.test_state->stdout_saved = stdout; stderr = stdout; #endif @@ -73,8 +73,8 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt) return; } - env.stdout = stdout; - env.stderr = stderr; + env.stdout_saved = stdout; + env.stderr_saved = stderr; stdio_hijack_init(log_buf, log_cnt); #endif @@ -91,13 +91,13 @@ static void stdio_restore_cleanup(void) fflush(stdout); if (env.subtest_state) { - fclose(env.subtest_state->stdout); - env.subtest_state->stdout = NULL; - stdout = env.test_state->stdout; - stderr = env.test_state->stdout; + fclose(env.subtest_state->stdout_saved); + env.subtest_state->stdout_saved = NULL; + stdout = env.test_state->stdout_saved; + stderr = env.test_state->stdout_saved; } else { - fclose(env.test_state->stdout); - env.test_state->stdout = NULL; + fclose(env.test_state->stdout_saved); + env.test_state->stdout_saved = NULL; } #endif } @@ -110,13 +110,13 @@ static void stdio_restore(void) return; } - if (stdout == env.stdout) + if (stdout == env.stdout_saved) return; stdio_restore_cleanup(); - stdout = env.stdout; - stderr = env.stderr; + stdout = env.stdout_saved; + stderr = env.stderr_saved; #endif } @@ -244,25 +244,25 @@ static void print_test_result(const struct 
prog_test_def *test, const struct tes int skipped_cnt = test_state->skip_cnt; int subtests_cnt = test_state->subtest_num; - fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); + fprintf(env.stdout_saved, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); if (test_state->error_cnt) - fprintf(env.stdout, "FAIL"); + fprintf(env.stdout_saved, "FAIL"); else if (!skipped_cnt) - fprintf(env.stdout, "OK"); + fprintf(env.stdout_saved, "OK"); else if (skipped_cnt == subtests_cnt || !subtests_cnt) - fprintf(env.stdout, "SKIP"); + fprintf(env.stdout_saved, "SKIP"); else - fprintf(env.stdout, "OK (SKIP: %d/%d)", skipped_cnt, subtests_cnt); + fprintf(env.stdout_saved, "OK (SKIP: %d/%d)", skipped_cnt, subtests_cnt); - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void print_test_log(char *log_buf, size_t log_cnt) { log_buf[log_cnt] = '\0'; - fprintf(env.stdout, "%s", log_buf); + fprintf(env.stdout_saved, "%s", log_buf); if (log_buf[log_cnt - 1] != '\n') - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void print_subtest_name(int test_num, int subtest_num, @@ -273,14 +273,14 @@ static void print_subtest_name(int test_num, int subtest_num, snprintf(test_num_str, sizeof(test_num_str), "%d/%d", test_num, subtest_num); - fprintf(env.stdout, "#%-*s %s/%s", + fprintf(env.stdout_saved, "#%-*s %s/%s", TEST_NUM_WIDTH, test_num_str, test_name, subtest_name); if (result) - fprintf(env.stdout, ":%s", result); + fprintf(env.stdout_saved, ":%s", result); - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void jsonw_write_log_message(json_writer_t *w, char *log_buf, size_t log_cnt) @@ -465,7 +465,7 @@ bool test__start_subtest(const char *subtest_name) memset(subtest_state, 0, sub_state_size); if (!subtest_name || !subtest_name[0]) { - fprintf(env.stderr, + fprintf(env.stderr_saved, "Subtest #%d didn't provide sub-test name!\n", state->subtest_num); return false; @@ -473,7 +473,7 @@ bool test__start_subtest(const char *subtest_name) subtest_state->name = strdup(subtest_name); if (!subtest_state->name) { - fprintf(env.stderr, + fprintf(env.stderr_saved, "Subtest #%d: failed to copy subtest name!\n", state->subtest_num); return false; @@ -1043,7 +1043,7 @@ void crash_handler(int signum) sz = backtrace(bt, ARRAY_SIZE(bt)); - if (env.stdout) + if (env.stdout_saved) stdio_restore(); if (env.test) { env.test_state->error_cnt++; @@ -1359,7 +1359,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) if (env->json) { w = jsonw_new(env->json); if (!w) - fprintf(env->stderr, "Failed to create new JSON stream."); + fprintf(env->stderr_saved, "Failed to create new JSON stream."); } if (w) { @@ -1708,8 +1708,8 @@ int main(int argc, char **argv) return -1; } - env.stdout = stdout; - env.stderr = stderr; + env.stdout_saved = stdout; + env.stderr_saved = stderr; env.has_testmod = true; if (!env.list_test_names) { @@ -1717,7 +1717,7 @@ int main(int argc, char **argv) unload_bpf_testmod(verbose()); if (load_bpf_testmod(verbose())) { - fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n"); + fprintf(env.stderr_saved, "WARNING! 
Selftests relying on bpf_testmod.ko will be skipped.\n"); env.has_testmod = false; } } @@ -1795,7 +1795,7 @@ int main(int argc, char **argv) } if (env.list_test_names) { - fprintf(env.stdout, "%s\n", test->test_name); + fprintf(env.stdout_saved, "%s\n", test->test_name); env.succ_cnt++; continue; } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index b1e949fb16cf..cb9d6d46826b 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -75,7 +75,7 @@ struct subtest_state { bool skipped; bool filtered; - FILE *stdout; + FILE *stdout_saved; }; struct test_state { @@ -92,7 +92,7 @@ struct test_state { size_t log_cnt; char *log_buf; - FILE *stdout; + FILE *stdout_saved; }; struct test_env { @@ -111,8 +111,8 @@ struct test_env { struct test_state *test_state; /* current running test state */ struct subtest_state *subtest_state; /* current running subtest state */ - FILE *stdout; - FILE *stderr; + FILE *stdout_saved; + FILE *stderr_saved; int nr_cpus; FILE *json; -- cgit v1.2.3 From 21c5f4f55da759c7444a1ef13e90b6e6f674eeeb Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:24 -0700 Subject: selftests/bpf: Fix error compiling tc_redirect.c with musl libc Linux 5.1 implemented 64-bit time types and related syscalls to address the Y2038 problem generally across archs. Userspace handling of Y2038 varies with the libc however. While musl libc uses 64-bit time across all 32-bit and 64-bit platforms, GNU glibc uses 64-bit time on 64-bit platforms but defaults to 32-bit time on 32-bit platforms unless they "opt-in" to 64-bit time or explicitly use 64-bit syscalls and time structures. One specific area is the standard setsockopt() call, SO_TIMESTAMPNS option used for timestamping, and the related output 'struct timespec'. GNU glibc defaults as above, also exposing the SO_TIMESTAMPNS_NEW flag to explicitly use a 64-bit call and 'struct __kernel_timespec'. Since these are not exposed or needed with musl libc, their use in tc_redirect.c leads to compile errors building for mips64el/musl: tc_redirect.c: In function 'rcv_tstamp': tc_redirect.c:425:32: error: 'SO_TIMESTAMPNS_NEW' undeclared (first use in this function); did you mean 'SO_TIMESTAMPNS'? 425 | cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) | ^~~~~~~~~~~~~~~~~~ | SO_TIMESTAMPNS tc_redirect.c:425:32: note: each undeclared identifier is reported only once for each function it appears in tc_redirect.c: In function 'test_inet_dtime': tc_redirect.c:491:49: error: 'SO_TIMESTAMPNS_NEW' undeclared (first use in this function); did you mean 'SO_TIMESTAMPNS'? 491 | err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, | ^~~~~~~~~~~~~~~~~~ | SO_TIMESTAMPNS However, using SO_TIMESTAMPNS_NEW isn't strictly needed, nor is Y2038 being explicitly tested. The timestamp checks in tc_redirect.c are simple: the packet receive timestamp is non-zero and processed/handled in less than 5 seconds. Switch to using the standard setsockopt() call and SO_TIMESTAMPNS option to ensure compatibility across glibc and musl libc. In the worst-case, there is a 5-second window 14 years from now where tc_redirect tests may fail on 32-bit systems. However, we should reasonably expect glibc to adopt a 64-bit mandate rather than the current "opt-in" policy before the Y2038 roll-over. 
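[Editor's sketch] A minimal version of the portable receive-timestamp setup the patch switches to (helper name illustrative):

```
#include <sys/socket.h>

static int enable_rx_tstamp(int fd)
{
	int on = 1;

	/* the kernel attaches a SOL_SOCKET/SO_TIMESTAMPNS cmsg carrying a
	 * 'struct timespec' receive time; unlike SO_TIMESTAMPNS_NEW, this
	 * option and struct are exposed by both glibc and musl */
	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS, &on, sizeof(on));
}
```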
Fixes: ce6f6cffaeaa ("selftests/bpf: Wait for the netstamp_needed_key static key to be turned on") Fixes: c803475fd8dd ("bpf: selftests: test skb->tstamp in redirect_neigh") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/031d656c058b4e55ceae56ef49c4e1729b5090f3.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 327d51f59142..53b8ffc943dc 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -471,7 +471,7 @@ static int set_forwarding(bool enable) static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp) { - struct __kernel_timespec pkt_ts = {}; + struct timespec pkt_ts = {}; char ctl[CMSG_SPACE(sizeof(pkt_ts))]; struct timespec now_ts; struct msghdr msg = {}; @@ -495,7 +495,7 @@ static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp) cmsg = CMSG_FIRSTHDR(&msg); if (cmsg && cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) + cmsg->cmsg_type == SO_TIMESTAMPNS) memcpy(&pkt_ts, CMSG_DATA(cmsg), sizeof(pkt_ts)); pkt_ns = pkt_ts.tv_sec * NSEC_PER_SEC + pkt_ts.tv_nsec; @@ -537,9 +537,9 @@ static int wait_netstamp_needed_key(void) if (!ASSERT_GE(srv_fd, 0, "start_server")) goto done; - err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS, &opt, sizeof(opt)); - if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)")) + if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)")) goto done; cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS); @@ -621,9 +621,9 @@ static void test_inet_dtime(int family, int type, const char *addr, __u16 port) return; /* Ensure the kernel puts the (rcv) timestamp for all skb */ - err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS, &opt, sizeof(opt)); - if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)")) + if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)")) goto done; if (type == SOCK_STREAM) { -- cgit v1.2.3 From 92cc2456e9775dc4333fb4aa430763ae4ac2f2d9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Jul 2024 16:18:05 -0700 Subject: selftests/bpf: fix RELEASE=1 compilation for sock_addr.c When building selftests with RELEASE=1 using GCC compiler, it complaints about uninitialized err. Fix the problem. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240730231805.1933923-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index b880c564a204..a6ee7f8d4f79 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -2642,6 +2642,7 @@ void test_sock_addr(void) break; default: ASSERT_TRUE(false, "Unknown sock addr test type"); + err = -EINVAL; break; } -- cgit v1.2.3 From ac93ca125b5409df56c5139648bbe10fd1ea989b Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Tue, 23 Jul 2024 19:46:36 -0700 Subject: tools: gpio: Fix the wrong format specifier The unsigned int should use "%u" instead of "%d". 
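[Editor's note] A tiny illustration of why the specifier matters (values are examples):

```
#include <stdio.h>

int main(void)
{
	unsigned int line = 4294967295u;	/* UINT_MAX */

	printf("%d\n", line);	/* mismatched: typically prints -1 */
	printf("%u\n", line);	/* correct: prints 4294967295 */
	return 0;
}
```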
Signed-off-by: Zhu Jun Link: https://lore.kernel.org/r/20240724024636.3634-1-zhujun2@cmss.chinamobile.com Signed-off-by: Bartosz Golaszewski --- tools/gpio/gpio-hammer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/gpio/gpio-hammer.c b/tools/gpio/gpio-hammer.c index 54fdf59dd320..ba0866eb3581 100644 --- a/tools/gpio/gpio-hammer.c +++ b/tools/gpio/gpio-hammer.c @@ -54,7 +54,7 @@ int hammer_device(const char *device_name, unsigned int *lines, int num_lines, fprintf(stdout, "Hammer lines ["); for (i = 0; i < num_lines; i++) { - fprintf(stdout, "%d", lines[i]); + fprintf(stdout, "%u", lines[i]); if (i != (num_lines - 1)) fprintf(stdout, ", "); } @@ -89,7 +89,7 @@ int hammer_device(const char *device_name, unsigned int *lines, int num_lines, fprintf(stdout, "["); for (i = 0; i < num_lines; i++) { - fprintf(stdout, "%d: %d", lines[i], + fprintf(stdout, "%u: %d", lines[i], gpiotools_test_bit(values.bits, i)); if (i != (num_lines - 1)) fprintf(stdout, ", "); -- cgit v1.2.3 From 30b968b002a92870325a5c9d1ce78eba0ce386e7 Mon Sep 17 00:00:00 2001 From: Yu-Ting Tseng Date: Tue, 9 Jul 2024 00:00:49 -0700 Subject: binder: frozen notification binder_features flag Add a flag to binder_features to indicate that the freeze notification feature is available. Signed-off-by: Yu-Ting Tseng Acked-by: Carlos Llamas Link: https://lore.kernel.org/r/20240709070047.4055369-6-yutingtseng@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binderfs.c | 8 ++++++++ tools/testing/selftests/filesystems/binderfs/binderfs_test.c | 1 + 2 files changed, 9 insertions(+) (limited to 'tools') diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index 3001d754ac36..ad1fa7abc323 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -58,6 +58,7 @@ enum binderfs_stats_mode { struct binder_features { bool oneway_spam_detection; bool extended_error; + bool freeze_notification; }; static const struct constant_table binderfs_param_stats[] = { @@ -74,6 +75,7 @@ static const struct fs_parameter_spec binderfs_fs_parameters[] = { static struct binder_features binder_features = { .oneway_spam_detection = true, .extended_error = true, + .freeze_notification = true, }; static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb) @@ -608,6 +610,12 @@ static int init_binder_features(struct super_block *sb) if (IS_ERR(dentry)) return PTR_ERR(dentry); + dentry = binderfs_create_file(dir, "freeze_notification", + &binder_features_fops, + &binder_features.freeze_notification); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + return 0; } diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c index 5f362c0fd890..319567f0fae1 100644 --- a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c +++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c @@ -65,6 +65,7 @@ static int __do_binderfs_test(struct __test_metadata *_metadata) static const char * const binder_features[] = { "oneway_spam_detection", "extended_error", + "freeze_notification", }; change_mountns(_metadata); -- cgit v1.2.3 From 45a0c928e7aa42caf2380b134bcd326b40a9df84 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:39 +0800 Subject: perf trace: BTF-based enum pretty printing for syscall args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this patch, BTF is used to turn enum value to the corresponding name. 
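[Editor's sketch] A minimal form of the lazy, cached loading strategy described below (names are illustrative; the patch's own version appears in the diff):

```
#include <bpf/btf.h>

static struct btf *cached_btf;

static struct btf *get_vmlinux_btf(void)
{
	/* parse /sys/kernel/btf/vmlinux at most once, on first use */
	if (!cached_btf)
		cached_btf = btf__load_vmlinux_btf();	/* NULL on failure */
	return cached_btf;
}
```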
There is only one system call that uses enum value as its argument, that is `landlock_add_rule()`. The vmlinux btf is loaded lazily, when user decided to trace the `landlock_add_rule` syscall. But if one decide to run `perf trace` without any arguments, the behaviour is to trace `landlock_add_rule`, so vmlinux btf will be loaded by default. The laziest behaviour is to load vmlinux btf when a `landlock_add_rule` syscall hits. But I think you could lose some samples when loading vmlinux btf at run time, for it can delay the handling of other samples. I might need your precious opinions on this... before: ``` perf $ ./perf trace -e landlock_add_rule 0.000 ( 0.008 ms): ldlck-test/438194 landlock_add_rule(rule_type: 2) = -1 EBADFD (File descriptor in bad state) 0.010 ( 0.001 ms): ldlck-test/438194 landlock_add_rule(rule_type: 1) = -1 EBADFD (File descriptor in bad state) ``` after: ``` perf $ ./perf trace -e landlock_add_rule 0.000 ( 0.029 ms): ldlck-test/438194 landlock_add_rule(rule_type: LANDLOCK_RULE_NET_PORT) = -1 EBADFD (File descriptor in bad state) 0.036 ( 0.004 ms): ldlck-test/438194 landlock_add_rule(rule_type: LANDLOCK_RULE_PATH_BENEATH) = -1 EBADFD (File descriptor in bad state) ``` Committer notes: Made it build with NO_LIBBPF=1, simplified btf_enum_fprintf(), see [1] for the discussion. Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Günther Noack Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Mickaël Salaün Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240613022757.3589783-1-howardchu95@gmail.com Link: https://lore.kernel.org/lkml/ZnXAhFflUl_LV1QY@x1 # [1] Link: https://lore.kernel.org/r/20240624181345.124764-3-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 110 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8449f2beb54d..1391564911d9 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -19,6 +19,7 @@ #ifdef HAVE_LIBBPF_SUPPORT #include #include +#include #ifdef HAVE_BPF_SKEL #include "bpf_skel/augmented_raw_syscalls.skel.h" #endif @@ -110,6 +111,10 @@ struct syscall_arg_fmt { const char *name; u16 nr_entries; // for arrays bool show_zero; + bool is_enum; +#ifdef HAVE_LIBBPF_SUPPORT + const struct btf_type *type; +#endif }; struct syscall_fmt { @@ -139,6 +144,9 @@ struct trace { } syscalls; #ifdef HAVE_BPF_SKEL struct augmented_raw_syscalls_bpf *skel; +#endif +#ifdef HAVE_LIBBPF_SUPPORT + struct btf *btf; #endif struct record_opts opts; struct evlist *evlist; @@ -204,6 +212,20 @@ struct trace { } oe; }; +static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused) +{ +#ifdef HAVE_LIBBPF_SUPPORT + if (trace->btf != NULL) + return; + + trace->btf = btf__load_vmlinux_btf(); + if (verbose > 0) { + fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" : + "Failed to load vmlinux BTF\n"); + } +#endif +} + struct tp_field { int offset; union { @@ -887,6 +909,64 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags +#ifdef HAVE_LIBBPF_SUPPORT +static int syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) +{ + int id; + + // Already cached? 
+ if (arg_fmt->type != NULL) + return 0; + + type = strstr(type, "enum "); + if (type == NULL) + return -1; + + type += 5; // skip "enum " to get the enumeration name + + id = btf__find_by_name(btf, type); + if (id < 0) + return -1; + + arg_fmt->type = btf__type_by_id(btf, id); + return arg_fmt->type == NULL ? -1 : 0; +} + +static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val) +{ + struct btf_enum *be = btf_enum(type); + const int nr_entries = btf_vlen(type); + + for (int i = 0; i < nr_entries; ++i, ++be) { + if (be->val == val) { + return scnprintf(bf, size, "%s", + btf__name_by_offset(btf, be->name_off)); + } + } + + return 0; +} + +static size_t trace__btf_enum_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, + size_t size, int val, char *type) +{ + if (trace->btf == NULL) + return 0; + + if (syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type) < 0) + return 0; + + return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); +} +#else // HAVE_LIBBPF_SUPPORT +static size_t trace__btf_enum_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg_fmt *arg_fmt __maybe_unused, + char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, + char *type __maybe_unused) +{ + return 0; +} +#endif // HAVE_LIBBPF_SUPPORT + #define STRARRAY(name, array) \ { .scnprintf = SCA_STRARRAY, \ .strtoul = STUL_STRARRAY, \ @@ -1238,6 +1318,7 @@ struct syscall { bool is_exit; bool is_open; bool nonexistent; + bool use_btf; struct tep_format_field *args; const char *name; const struct syscall_fmt *fmt; @@ -1744,7 +1825,8 @@ static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *n } static struct tep_format_field * -syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field) +syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field, + bool *use_btf) { struct tep_format_field *last_field = NULL; int len; @@ -1756,6 +1838,7 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field continue; len = strlen(field->name); + arg->is_enum = false; if (strcmp(field->type, "const char *") == 0 && ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) || @@ -1782,6 +1865,8 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field * 7 unsigned long */ arg->scnprintf = SCA_FD; + } else if (strstr(field->type, "enum") && use_btf != NULL) { + *use_btf = arg->is_enum = true; } else { const struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); @@ -1798,7 +1883,8 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field static int syscall__set_arg_fmts(struct syscall *sc) { - struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args); + struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args, + &sc->use_btf); if (last_field) sc->args_size = last_field->offset + last_field->size; @@ -1811,6 +1897,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) char tp_name[128]; struct syscall *sc; const char *name = syscalltbl__name(trace->sctbl, id); + int err; #ifdef HAVE_SYSCALL_TABLE_SUPPORT if (trace->syscalls.table == NULL) { @@ -1883,7 +1970,13 @@ static int trace__read_syscall_info(struct trace *trace, int id) sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat"); - return 
syscall__set_arg_fmts(sc); + err = syscall__set_arg_fmts(sc); + + /* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */ + if (sc->use_btf) + trace__load_vmlinux_btf(trace); + + return err; } static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) @@ -1891,7 +1984,7 @@ static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt != NULL) { - syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields); + syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, NULL); return 0; } @@ -2103,6 +2196,15 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); + if (sc->arg_fmt[arg.idx].is_enum) { + size_t p = trace__btf_enum_scnprintf(trace, &sc->arg_fmt[arg.idx], bf + printed, + size - printed, val, field->type); + if (p) { + printed += p; + continue; + } + } + printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val); } -- cgit v1.2.3 From 607bbdb49ccb646be707a0f2ac1d78f5a7c3de7c Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:40 +0800 Subject: perf trace: Augment non-syscall tracepoints with enum arguments with BTF Before: perf $ ./perf trace -e timer:hrtimer_start --max-events=1 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff974466c25f18, function: 0xffffffff89da5be0, expires: 377432432256753, softexpires: 377432432256753, mode: 10) After: perf $ ./perf trace -e timer:hrtimer_start --max-events=1 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff9498a6ca5f18, function: 0xffffffffa77a5be0, expires: 4382442895089, softexpires: 4382442895089, mode: HRTIMER_MODE_ABS_PINNED_HARD) in which HRTIMER_MODE_ABS_PINNED_HARD is: perf $ pahole hrtimer_mode enum hrtimer_mode { HRTIMER_MODE_ABS = 0, HRTIMER_MODE_REL = 1, HRTIMER_MODE_PINNED = 2, HRTIMER_MODE_SOFT = 4, HRTIMER_MODE_HARD = 8, HRTIMER_MODE_ABS_PINNED = 2, HRTIMER_MODE_REL_PINNED = 3, HRTIMER_MODE_ABS_SOFT = 4, HRTIMER_MODE_REL_SOFT = 5, HRTIMER_MODE_ABS_PINNED_SOFT = 6, HRTIMER_MODE_REL_PINNED_SOFT = 7, HRTIMER_MODE_ABS_HARD = 8, HRTIMER_MODE_REL_HARD = 9, HRTIMER_MODE_ABS_PINNED_HARD = 10, HRTIMER_MODE_REL_PINNED_HARD = 11, }; Can also be tested by ./perf trace -e pagemap:mm_lru_insertion,timer:hrtimer_start,timer:hrtimer_init,skb:kfree_skb --max-events=10 (Chose these 4 events because they happen quite frequently.) However some enum arguments may not be contained in vmlinux BTF. 
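[Editor's sketch] Pulling the pieces together, a self-contained sketch of the vmlinux-BTF lookup both patches build on; the "hrtimer_mode"/value pair is just an example taken from the message above, and the lookup fails cleanly when the enum type is absent from BTF:

```
#include <stdio.h>
#include <bpf/btf.h>

static const char *enum_val_name(struct btf *btf, const char *enum_name, int val)
{
	const struct btf_type *t;
	const struct btf_enum *e;
	int id;

	id = btf__find_by_name(btf, enum_name);
	if (id < 0)
		return NULL;	/* enum type not present in vmlinux BTF */

	t = btf__type_by_id(btf, id);
	if (!btf_is_enum(t))
		return NULL;

	e = btf_enum(t);
	for (int i = 0; i < btf_vlen(t); i++, e++) {
		if (e->val == val)
			return btf__name_by_offset(btf, e->name_off);
	}
	return NULL;
}

int main(void)
{
	struct btf *btf = btf__load_vmlinux_btf();
	const char *name = btf ? enum_val_name(btf, "hrtimer_mode", 10) : NULL;

	printf("%s\n", name ? name : "<unknown>");	/* HRTIMER_MODE_ABS_PINNED_HARD */
	btf__free(btf);
	return 0;
}
```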
To see what enum arguments are supported, use: vmlinux_dir $ bpftool btf dump file /sys/kernel/btf/vmlinux > vmlinux vmlinux_dir $ while read l; do grep "ENUM '$l'" vmlinux; done < <(grep field:enum /sys/kernel/tracing/events/*/*/format | awk '{print $3}' | sort | uniq) | awk '{print $3}' | sed "s/'\(.*\)'/\1/g" dev_pm_qos_req_type error_detector hrtimer_mode i2c_slave_event ieee80211_bss_type lru_list migrate_mode nl80211_auth_type nl80211_band nl80211_iftype numa_vmaskip_reason pm_qos_req_action pwm_polarity skb_drop_reason thermal_trip_type xen_lazy_mode xen_mc_extend_args xen_mc_flush_reason zone_type And what tracepoints have these enum types as their arguments: vmlinux_dir $ while read l; do grep "ENUM '$l'" vmlinux; done < <(grep field:enum /sys/kernel/tracing/events/*/*/format | awk '{print $3}' | sort | uniq) | awk '{print $3}' | sed "s/'\(.*\)'/\1/g" > good_enums vmlinux_dir $ cat good_enums dev_pm_qos_req_type error_detector hrtimer_mode i2c_slave_event ieee80211_bss_type lru_list migrate_mode nl80211_auth_type nl80211_band nl80211_iftype numa_vmaskip_reason pm_qos_req_action pwm_polarity skb_drop_reason thermal_trip_type xen_lazy_mode xen_mc_extend_args xen_mc_flush_reason zone_type vmlinux_dir $ grep -f good_enums -l /sys/kernel/tracing/events/*/*/format /sys/kernel/tracing/events/cfg80211/cfg80211_chandef_dfs_required/format /sys/kernel/tracing/events/cfg80211/cfg80211_ch_switch_notify/format /sys/kernel/tracing/events/cfg80211/cfg80211_ch_switch_started_notify/format /sys/kernel/tracing/events/cfg80211/cfg80211_get_bss/format /sys/kernel/tracing/events/cfg80211/cfg80211_ibss_joined/format /sys/kernel/tracing/events/cfg80211/cfg80211_inform_bss_frame/format /sys/kernel/tracing/events/cfg80211/cfg80211_radar_event/format /sys/kernel/tracing/events/cfg80211/cfg80211_ready_on_channel_expired/format /sys/kernel/tracing/events/cfg80211/cfg80211_ready_on_channel/format /sys/kernel/tracing/events/cfg80211/cfg80211_reg_can_beacon/format /sys/kernel/tracing/events/cfg80211/cfg80211_return_bss/format /sys/kernel/tracing/events/cfg80211/cfg80211_tx_mgmt_expired/format /sys/kernel/tracing/events/cfg80211/rdev_add_virtual_intf/format /sys/kernel/tracing/events/cfg80211/rdev_auth/format /sys/kernel/tracing/events/cfg80211/rdev_change_virtual_intf/format /sys/kernel/tracing/events/cfg80211/rdev_channel_switch/format /sys/kernel/tracing/events/cfg80211/rdev_connect/format /sys/kernel/tracing/events/cfg80211/rdev_inform_bss/format /sys/kernel/tracing/events/cfg80211/rdev_libertas_set_mesh_channel/format /sys/kernel/tracing/events/cfg80211/rdev_mgmt_tx/format /sys/kernel/tracing/events/cfg80211/rdev_remain_on_channel/format /sys/kernel/tracing/events/cfg80211/rdev_return_chandef/format /sys/kernel/tracing/events/cfg80211/rdev_return_int_survey_info/format /sys/kernel/tracing/events/cfg80211/rdev_set_ap_chanwidth/format /sys/kernel/tracing/events/cfg80211/rdev_set_monitor_channel/format /sys/kernel/tracing/events/cfg80211/rdev_set_radar_background/format /sys/kernel/tracing/events/cfg80211/rdev_start_ap/format /sys/kernel/tracing/events/cfg80211/rdev_start_radar_detection/format /sys/kernel/tracing/events/cfg80211/rdev_tdls_channel_switch/format /sys/kernel/tracing/events/compaction/mm_compaction_defer_compaction/format /sys/kernel/tracing/events/compaction/mm_compaction_deferred/format /sys/kernel/tracing/events/compaction/mm_compaction_defer_reset/format /sys/kernel/tracing/events/compaction/mm_compaction_finished/format 
/sys/kernel/tracing/events/compaction/mm_compaction_kcompactd_wake/format /sys/kernel/tracing/events/compaction/mm_compaction_suitable/format /sys/kernel/tracing/events/compaction/mm_compaction_wakeup_kcompactd/format /sys/kernel/tracing/events/error_report/error_report_end/format /sys/kernel/tracing/events/i2c_slave/i2c_slave/format /sys/kernel/tracing/events/migrate/mm_migrate_pages/format /sys/kernel/tracing/events/migrate/mm_migrate_pages_start/format /sys/kernel/tracing/events/pagemap/mm_lru_insertion/format /sys/kernel/tracing/events/power/dev_pm_qos_add_request/format /sys/kernel/tracing/events/power/dev_pm_qos_remove_request/format /sys/kernel/tracing/events/power/dev_pm_qos_update_request/format /sys/kernel/tracing/events/power/pm_qos_update_flags/format /sys/kernel/tracing/events/power/pm_qos_update_target/format /sys/kernel/tracing/events/pwm/pwm_apply/format /sys/kernel/tracing/events/pwm/pwm_get/format /sys/kernel/tracing/events/sched/sched_skip_vma_numa/format /sys/kernel/tracing/events/skb/kfree_skb/format /sys/kernel/tracing/events/thermal/thermal_zone_trip/format /sys/kernel/tracing/events/timer/hrtimer_init/format /sys/kernel/tracing/events/timer/hrtimer_start/format /sys/kernel/tracing/events/xen/xen_mc_batch/format /sys/kernel/tracing/events/xen/xen_mc_extend_args/format /sys/kernel/tracing/events/xen/xen_mc_flush_reason/format /sys/kernel/tracing/events/xen/xen_mc_issue/format Committer testing: root@x1:~# perf trace -e timer:hrtimer_start --max-events=2 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff8d4eff225050, function: 0xffffffff9e22ddd0, expires: 241152380000000, softexpires: 241152380000000, mode: HRTIMER_MODE_ABS) 0.028 :0/0 timer:hrtimer_start(hrtimer: 0xffff8d4eff225050, function: 0xffffffff9e22ddd0, expires: 241153654000000, softexpires: 241153654000000, mode: HRTIMER_MODE_ABS_PINNED_HARD) root@x1:~# Suggested-by: Arnaldo Carvalho de Melo Reviewed-by: Arnaldo Carvalho de Melo Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240615032743.112750-1-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240624181345.124764-4-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 1391564911d9..5618feb7d01a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1979,12 +1979,12 @@ static int trace__read_syscall_info(struct trace *trace, int id) return err; } -static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) +static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf) { struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt != NULL) { - syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, NULL); + syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, use_btf); return 0; } @@ -2188,7 +2188,8 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, * property isn't set. */ if (val == 0 && !trace->show_zeros && - !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero)) + !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) && + !(sc->arg_fmt && sc->arg_fmt[arg.idx].is_enum)) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? 
", " : ""); @@ -2893,7 +2894,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val); /* Suppress this argument if its value is zero and show_zero property isn't set. */ - if (val == 0 && !trace->show_zeros && !arg->show_zero) + if (val == 0 && !trace->show_zeros && !arg->show_zero && !arg->is_enum) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); @@ -2901,6 +2902,15 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); + if (arg->is_enum) { + size_t p = trace__btf_enum_scnprintf(trace, arg, bf + printed, + size - printed, val, field->type); + if (p) { + printed += p; + continue; + } + } + printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val); } @@ -4553,7 +4563,7 @@ static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name) } } -static int evlist__set_syscall_tp_fields(struct evlist *evlist) +static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf) { struct evsel *evsel; @@ -4562,7 +4572,7 @@ static int evlist__set_syscall_tp_fields(struct evlist *evlist) continue; if (strcmp(evsel->tp_format->system, "syscalls")) { - evsel__init_tp_arg_scnprintf(evsel); + evsel__init_tp_arg_scnprintf(evsel, use_btf); continue; } @@ -5040,11 +5050,16 @@ skip_augmentation: } if (trace.evlist->core.nr_entries > 0) { + bool use_btf = false; + evlist__set_default_evsel_handler(trace.evlist, trace__event_handler); - if (evlist__set_syscall_tp_fields(trace.evlist)) { + if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) { perror("failed to set syscalls:* tracepoint fields"); goto out; } + + if (use_btf) + trace__load_vmlinux_btf(&trace); } if (trace.sort_events) { -- cgit v1.2.3 From 95586588868a04ea2bbfa144cf473bea9bb86110 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:41 +0800 Subject: perf trace: Filter enum arguments with enum names Before: perf $ ./perf trace -e timer:hrtimer_start --filter='mode!=HRTIMER_MODE_ABS_PINNED_HARD' --max-events=1 No resolver (strtoul) for "mode" in "timer:hrtimer_start", can't set filter "(mode!=HRTIMER_MODE_ABS_PINNED_HARD) && (common_pid != 281988)" After: perf $ ./perf trace -e timer:hrtimer_start --filter='mode!=HRTIMER_MODE_ABS_PINNED_HARD' --max-events=1 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff9498a6ca5f18, function: 0xffffffffa77a5be0, expires: 12351248764875, softexpires: 12351248764875, mode: HRTIMER_MODE_ABS) && and ||: perf $ ./perf trace -e timer:hrtimer_start --filter='mode != HRTIMER_MODE_ABS_PINNED_HARD && mode != HRTIMER_MODE_ABS' --max-events=1 0.000 Hyprland/534 timer:hrtimer_start(hrtimer: 0xffff9497801a84d0, function: 0xffffffffc04cdbe0, expires: 12639434638458, softexpires: 12639433638458, mode: HRTIMER_MODE_REL) perf $ ./perf trace -e timer:hrtimer_start --filter='mode == HRTIMER_MODE_REL || mode == HRTIMER_MODE_PINNED' --max-events=1 0.000 ldlck-test/60639 timer:hrtimer_start(hrtimer: 0xffffb16404ee7bf8, function: 0xffffffffa7790420, expires: 12772614418016, softexpires: 12772614368016, mode: HRTIMER_MODE_REL) Switching it up, using both enum name and integer value(--filter='mode == HRTIMER_MODE_ABS_PINNED_HARD || mode == 0'): perf $ ./perf trace -e timer:hrtimer_start --filter='mode == HRTIMER_MODE_ABS_PINNED_HARD || mode == 0' --max-events=3 0.000 :0/0 timer:hrtimer_start(hrtimer: 
0xffff9498a6ca5f18, function: 0xffffffffa77a5be0, expires: 12601748739825, softexpires: 12601748739825, mode: HRTIMER_MODE_ABS_PINNED_HARD) 0.036 :0/0 timer:hrtimer_start(hrtimer: 0xffff9498a6ca5f18, function: 0xffffffffa77a5be0, expires: 12518758748124, softexpires: 12518758748124, mode: HRTIMER_MODE_ABS_PINNED_HARD) 0.172 tmux: server/41881 timer:hrtimer_start(hrtimer: 0xffffb164081e7838, function: 0xffffffffa7790420, expires: 12518768255836, softexpires: 12518768205836, mode: HRTIMER_MODE_ABS) P.S. perf $ pahole hrtimer_mode enum hrtimer_mode { HRTIMER_MODE_ABS = 0, HRTIMER_MODE_REL = 1, HRTIMER_MODE_PINNED = 2, HRTIMER_MODE_SOFT = 4, HRTIMER_MODE_HARD = 8, HRTIMER_MODE_ABS_PINNED = 2, HRTIMER_MODE_REL_PINNED = 3, HRTIMER_MODE_ABS_SOFT = 4, HRTIMER_MODE_REL_SOFT = 5, HRTIMER_MODE_ABS_PINNED_SOFT = 6, HRTIMER_MODE_REL_PINNED_SOFT = 7, HRTIMER_MODE_ABS_HARD = 8, HRTIMER_MODE_REL_HARD = 9, HRTIMER_MODE_ABS_PINNED_HARD = 10, HRTIMER_MODE_REL_PINNED_HARD = 11, }; Committer testing: root@x1:~# perf trace -e timer:hrtimer_start --filter='mode != HRTIMER_MODE_ABS' --max-events=2 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff8d4eff2a5050, function: 0xffffffff9e22ddd0, expires: 241502326000000, softexpires: 241502326000000, mode: HRTIMER_MODE_ABS_PINNED_HARD) 18446744073709.488 :0/0 timer:hrtimer_start(hrtimer: 0xffff8d4eff425050, function: 0xffffffff9e22ddd0, expires: 241501814000000, softexpires: 241501814000000, mode: HRTIMER_MODE_ABS_PINNED_HARD) root@x1:~# perf trace -e timer:hrtimer_start --filter='mode != HRTIMER_MODE_ABS && mode != HRTIMER_MODE_ABS_PINNED_HARD' --max-events=2 0.000 podman/510644 timer:hrtimer_start(hrtimer: 0xffffa2024f5f7dd0, function: 0xffffffff9e2170c0, expires: 241530497418194, softexpires: 241530497368194, mode: HRTIMER_MODE_REL) 40.251 gnome-shell/2484 timer:hrtimer_start(hrtimer: 0xffff8d48bda17650, function: 0xffffffffc0661550, expires: 241550528619247, softexpires: 241550527619247, mode: HRTIMER_MODE_REL) root@x1:~# perf trace -v -e timer:hrtimer_start --filter='mode != HRTIMER_MODE_ABS && mode != HRTIMER_MODE_ABS_PINNED_HARD && mode != HRTIMER_MODE_REL' --max-events=2 Using CPUID GenuineIntel-6-BA-3 vmlinux BTF loaded 0 0xa 0x1 New filter for timer:hrtimer_start: (mode != 0 && mode != 0xa && mode != 0x1) && (common_pid != 524049 && common_pid != 4041) mmap size 528384B ^Croot@x1:~# Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/ZnCcliuecJABD5FN@x1 Link: https://lore.kernel.org/r/20240624181345.124764-5-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 62 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 5618feb7d01a..e664001d5ed7 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -932,6 +932,37 @@ static int syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, stru return arg_fmt->type == NULL ? 
-1 : 0; } +static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val) +{ + const struct btf_type *bt; + char *type = arg->parm; + struct btf_enum *be; + struct btf *btf; + + trace__load_vmlinux_btf(arg->trace); + + btf = arg->trace->btf; + if (btf == NULL) + return false; + + if (syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type) < 0) + return false; + + bt = arg->fmt->type; + be = btf_enum(bt); + for (int i = 0; i < btf_vlen(bt); ++i, ++be) { + const char *name = btf__name_by_offset(btf, be->name_off); + int max_len = max(size, strlen(name)); + + if (strncmp(name, bf, max_len) == 0) { + *val = be->val; + return true; + } + } + + return false; +} + static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val) { struct btf_enum *be = btf_enum(type); @@ -965,8 +996,16 @@ static size_t trace__btf_enum_scnprintf(struct trace *trace __maybe_unused, stru { return 0; } + +static bool syscall_arg__strtoul_btf_enum(char *bf __maybe_unused, size_t size __maybe_unused, + struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused) +{ + return false; +} #endif // HAVE_LIBBPF_SUPPORT +#define STUL_BTF_ENUM syscall_arg__strtoul_btf_enum + #define STRARRAY(name, array) \ { .scnprintf = SCA_STRARRAY, \ .strtoul = STUL_STRARRAY, \ @@ -1867,6 +1906,7 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field arg->scnprintf = SCA_FD; } else if (strstr(field->type, "enum") && use_btf != NULL) { *use_btf = arg->is_enum = true; + arg->strtoul = STUL_BTF_ENUM; } else { const struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); @@ -3792,7 +3832,8 @@ static int ordered_events__deliver_event(struct ordered_events *oe, return __trace__deliver_event(trace, event->event); } -static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg) +static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg, + char **type) { struct tep_format_field *field; struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel); @@ -3801,13 +3842,15 @@ static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel return NULL; for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt) - if (strcmp(field->name, arg) == 0) + if (strcmp(field->name, arg) == 0) { + *type = field->type; return fmt; + } return NULL; } -static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel *evsel) +static int trace__expand_filter(struct trace *trace, struct evsel *evsel) { char *tok, *left = evsel->filter, *new_filter = evsel->filter; @@ -3840,14 +3883,14 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel struct syscall_arg_fmt *fmt; int left_size = tok - left, right_size = right_end - right; - char arg[128]; + char arg[128], *type; while (isspace(left[left_size - 1])) --left_size; scnprintf(arg, sizeof(arg), "%.*s", left_size, left); - fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg); + fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type); if (fmt == NULL) { pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n", arg, evsel->name, evsel->filter); @@ -3860,9 +3903,16 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel if (fmt->strtoul) { u64 val; struct syscall_arg syscall_arg = { - .parm = fmt->parm, + .trace = trace, + .fmt = fmt, }; + if (fmt->is_enum) { + syscall_arg.parm = type; + } else { + 
syscall_arg.parm = fmt->parm; + } + + if (fmt->strtoul(right, right_size, &syscall_arg, &val)) { char *n, expansion[19]; int expansion_lenght = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val); -- cgit v1.2.3 From ba6a9018502eecb94e67001deeb48406fa71f916 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Wed, 31 Jul 2024 08:37:25 +0200 Subject: selftests/bpf: do not disable /dev/null device access in cgroup dev test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_dev_cgroup currently loads a small bpf program allowing any access on urandom and zero devices, disabling access to any other device. This makes migrating the test to test_progs impossible, since that framework extensively manipulates /dev/null. Allow /dev/null manipulation in the dev_cgroup program to make its use in the test_progs framework possible. Update test_dev_cgroup.c as well to match this change while it has not yet been removed. Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-1-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/dev_cgroup.c | 4 ++-- tools/testing/selftests/bpf/test_dev_cgroup.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/dev_cgroup.c b/tools/testing/selftests/bpf/progs/dev_cgroup.c index 79b54a4fa244..c1dfbd2b56fc 100644 --- a/tools/testing/selftests/bpf/progs/dev_cgroup.c +++ b/tools/testing/selftests/bpf/progs/dev_cgroup.c @@ -41,14 +41,14 @@ int bpf_prog1(struct bpf_cgroup_dev_ctx *ctx) bpf_trace_printk(fmt, sizeof(fmt), ctx->major, ctx->minor); #endif - /* Allow access to /dev/zero and /dev/random. + /* Allow access to /dev/null and /dev/urandom. * Forbid everything else. */ if (ctx->major != 1 || type != BPF_DEVCG_DEV_CHAR) return 0; switch (ctx->minor) { - case 5: /* 1:5 /dev/zero */ + case 3: /* 1:3 /dev/null */ case 9: /* 1:9 /dev/urandom */ return 1; } diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c index adeaf63cb6fa..33f544f0005a 100644 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ b/tools/testing/selftests/bpf/test_dev_cgroup.c @@ -54,25 +54,25 @@ int main(int argc, char **argv) goto err; } - /* All operations with /dev/zero and and /dev/urandom are allowed, + /* All operations with /dev/null and /dev/urandom are allowed, * everything else is forbidden.
*/ - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - assert(system("mknod /tmp/test_dev_cgroup_null c 1 3")); - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - - /* /dev/zero is whitelisted */ assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5") == 0); + assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5")); assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("dd if=/dev/urandom of=/dev/zero count=64") == 0); + /* /dev/null is whitelisted */ + assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); + assert(system("mknod /tmp/test_dev_cgroup_null c 1 3") == 0); + assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); + + assert(system("dd if=/dev/urandom of=/dev/null count=64") == 0); /* src is allowed, target is forbidden */ assert(system("dd if=/dev/urandom of=/dev/full count=64")); /* src is forbidden, target is allowed */ - assert(system("dd if=/dev/random of=/dev/zero count=64")); + assert(system("dd if=/dev/random of=/dev/null count=64")); error = 0; printf("test_dev_cgroup:PASS\n"); -- cgit v1.2.3 From d83d8230e415ecf727f6b88c0ae55193f149d650 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Wed, 31 Jul 2024 08:37:26 +0200 Subject: selftests/bpf: convert test_dev_cgroup to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_dev_cgroup is defined as a standalone test program, and so is not executed in CI. Convert it to the test_progs framework so it is tested automatically in CI, and remove the old test. In order to be able to run it in test_progs, /dev/null must remain usable, so change the new test to treat operations on device 1:3 (/dev/null) as valid, and operations on device 1:5 (/dev/zero) as invalid.
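For reference, the allow/deny policy the converted test exercises boils down to the check below, a minimal standalone sketch mirroring bpf_prog1() from dev_cgroup.c above (dev_allowed() is a hypothetical name used only for illustration): char devices 1:3 (/dev/null) and 1:9 (/dev/urandom) are allowed, everything else, including 1:5 (/dev/zero), is denied.

```c
#include <linux/bpf.h> /* BPF_DEVCG_DEV_CHAR */

/* Sketch of the policy in dev_cgroup.c: return 1 to allow, 0 to deny. */
static int dev_allowed(int type, int major, int minor)
{
	if (major != 1 || type != BPF_DEVCG_DEV_CHAR)
		return 0;

	switch (minor) {
	case 3: /* 1:3 /dev/null */
	case 9: /* 1:9 /dev/urandom */
		return 1;
	}

	return 0;
}
```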
Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-2-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 - .../testing/selftests/bpf/prog_tests/cgroup_dev.c | 121 +++++++++++++++++++++ tools/testing/selftests/bpf/test_dev_cgroup.c | 85 --------------- 4 files changed, 121 insertions(+), 88 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_dev.c delete mode 100644 tools/testing/selftests/bpf/test_dev_cgroup.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 4e4aae8aa7ec..8f14d8faeb0b 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -9,7 +9,6 @@ test_lpm_map test_tag FEATURE-DUMP.libbpf fixdep -test_dev_cgroup /test_progs /test_progs-no_alu32 /test_progs-bpf_gcc diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 774c6270e377..f54185e96a95 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -67,7 +67,6 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_dev_cgroup \ test_sock test_sockmap get_cgroup_id_user \ test_cgroup_storage \ test_tcpnotify_user test_sysctl \ @@ -292,7 +291,6 @@ JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o NETWORK_HELPERS := $(OUTPUT)/network_helpers.o -$(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c new file mode 100644 index 000000000000..8661e145ba84 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "dev_cgroup.skel.h" + +#define TEST_CGROUP "/test-bpf-based-device-cgroup/" +#define TEST_BUFFER_SIZE 64 + +static void test_mknod(const char *path, mode_t mode, int dev_major, + int dev_minor, int expected_ret, int expected_errno) +{ + int ret; + + unlink(path); + ret = mknod(path, mode, makedev(dev_major, dev_minor)); + ASSERT_EQ(ret, expected_ret, "mknod"); + if (expected_ret) + ASSERT_EQ(errno, expected_errno, "mknod errno"); + else + unlink(path); +} + +static void test_read(const char *path, char *buf, int buf_size, + int expected_ret, int expected_errno) +{ + int ret, fd; + + fd = open(path, O_RDONLY); + + /* A bare open on unauthorized device should fail */ + if (expected_ret < 0) { + ASSERT_EQ(fd, expected_ret, "open ret for read"); + ASSERT_EQ(errno, expected_errno, "open errno for read"); + if (fd >= 0) + close(fd); + return; + } + + if (!ASSERT_OK_FD(fd, "open ret for read")) + return; + + ret = read(fd, buf, buf_size); + ASSERT_EQ(ret, expected_ret, "read"); + + close(fd); +} + +static void test_write(const char *path, char *buf, int buf_size, + int expected_ret, int expected_errno) +{ + int ret, fd; + + fd = open(path, O_WRONLY); + + /* A bare open on unauthorized device should fail */ + if 
(expected_ret < 0) { + ASSERT_EQ(fd, expected_ret, "open ret for write"); + ASSERT_EQ(errno, expected_errno, "open errno for write"); + if (fd >= 0) + close(fd); + return; + } + + if (!ASSERT_OK_FD(fd, "open ret for write")) + return; + + ret = write(fd, buf, buf_size); + ASSERT_EQ(ret, expected_ret, "write"); + + close(fd); +} + +void test_cgroup_dev(void) +{ + char buf[TEST_BUFFER_SIZE] = "some random test data"; + struct dev_cgroup *skel; + int cgroup_fd; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup switch")) + return; + + skel = dev_cgroup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_cgroup; + + skel->links.bpf_prog1 = + bpf_program__attach_cgroup(skel->progs.bpf_prog1, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_prog1, "attach_program")) + goto cleanup_progs; + + if (test__start_subtest("allow-mknod")) + test_mknod("/dev/test_dev_cgroup_null", S_IFCHR, 1, 3, 0, 0); + + if (test__start_subtest("allow-read")) + test_read("/dev/urandom", buf, TEST_BUFFER_SIZE, + TEST_BUFFER_SIZE, 0); + + if (test__start_subtest("allow-write")) + test_write("/dev/null", buf, TEST_BUFFER_SIZE, + TEST_BUFFER_SIZE, 0); + + if (test__start_subtest("deny-mknod")) + test_mknod("/dev/test_dev_cgroup_zero", S_IFCHR, 1, 5, -1, + EPERM); + + if (test__start_subtest("deny-read")) + test_read("/dev/random", buf, TEST_BUFFER_SIZE, -1, EPERM); + + if (test__start_subtest("deny-write")) + test_write("/dev/zero", buf, TEST_BUFFER_SIZE, -1, EPERM); + +cleanup_progs: + dev_cgroup__destroy(skel); +cleanup_cgroup: + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c deleted file mode 100644 index 33f544f0005a..000000000000 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Copyright (c) 2017 Facebook - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -#define DEV_CGROUP_PROG "./dev_cgroup.bpf.o" - -#define TEST_CGROUP "/test-bpf-based-device-cgroup/" - -int main(int argc, char **argv) -{ - struct bpf_object *obj; - int error = EXIT_FAILURE; - int prog_fd, cgroup_fd; - __u32 prog_cnt; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (bpf_prog_test_load(DEV_CGROUP_PROG, BPF_PROG_TYPE_CGROUP_DEVICE, - &obj, &prog_fd)) { - printf("Failed to load DEV_CGROUP program\n"); - goto out; - } - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - if (cgroup_fd < 0) { - printf("Failed to create test cgroup\n"); - goto out; - } - - /* Attach bpf program */ - if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_DEVICE, 0)) { - printf("Failed to attach DEV_CGROUP program"); - goto err; - } - - if (bpf_prog_query(cgroup_fd, BPF_CGROUP_DEVICE, 0, NULL, NULL, - &prog_cnt)) { - printf("Failed to query attached programs"); - goto err; - } - - /* All operations with /dev/null and /dev/urandom are allowed, - * everything else is forbidden. 
- */ - assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5")); - assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - - /* /dev/null is whitelisted */ - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - assert(system("mknod /tmp/test_dev_cgroup_null c 1 3") == 0); - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - - assert(system("dd if=/dev/urandom of=/dev/null count=64") == 0); - - /* src is allowed, target is forbidden */ - assert(system("dd if=/dev/urandom of=/dev/full count=64")); - - /* src is forbidden, target is allowed */ - assert(system("dd if=/dev/random of=/dev/null count=64")); - - error = 0; - printf("test_dev_cgroup:PASS\n"); - -err: - cleanup_cgroup_environment(); - -out: - return error; -} -- cgit v1.2.3 From 84cdbff4a93501b7199f0d2f1150961ee95c8582 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Wed, 31 Jul 2024 08:37:27 +0200 Subject: selftests/bpf: add wrong type test to cgroup dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current cgroup_dev test mostly tests that a device operation is accepted or refused based on the passed major/minor (and so, any operation performed during the test involves only char devices). Add a small subtest ensuring that the device type passed to the bpf program allows it to make decisions as well. Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-3-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/cgroup_dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c index 8661e145ba84..5ab7547e38c0 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c @@ -114,6 +114,10 @@ void test_cgroup_dev(void) if (test__start_subtest("deny-write")) test_write("/dev/zero", buf, TEST_BUFFER_SIZE, -1, EPERM); + if (test__start_subtest("deny-mknod-wrong-type")) + test_mknod("/dev/test_dev_cgroup_block", S_IFBLK, 1, 3, -1, + EPERM); + cleanup_progs: dev_cgroup__destroy(skel); cleanup_cgroup: -- cgit v1.2.3 From 45eb1bf4d726caff8f1dce7ac8f950012d5f64b1 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 10 Jul 2024 13:15:34 +0500 Subject: selftests: tpm2: redirect python unittest logs to stdout The python unittest module writes all its output to stderr, even when the run is clean. Redirect its output logs to stdout.
Cc: Jarkko Sakkinen Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/tpm2/test_async.sh | 2 +- tools/testing/selftests/tpm2/test_smoke.sh | 2 +- tools/testing/selftests/tpm2/test_space.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/tpm2/test_async.sh b/tools/testing/selftests/tpm2/test_async.sh index 43bf5bd772fd..cf5a9c826097 100755 --- a/tools/testing/selftests/tpm2/test_async.sh +++ b/tools/testing/selftests/tpm2/test_async.sh @@ -7,4 +7,4 @@ ksft_skip=4 [ -e /dev/tpm0 ] || exit $ksft_skip [ -e /dev/tpmrm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.AsyncTest +python3 -m unittest -v tpm2_tests.AsyncTest 2>&1 diff --git a/tools/testing/selftests/tpm2/test_smoke.sh b/tools/testing/selftests/tpm2/test_smoke.sh index 58af963e5b55..20fa70f970a9 100755 --- a/tools/testing/selftests/tpm2/test_smoke.sh +++ b/tools/testing/selftests/tpm2/test_smoke.sh @@ -6,4 +6,4 @@ ksft_skip=4 [ -e /dev/tpm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.SmokeTest +python3 -m unittest -v tpm2_tests.SmokeTest 2>&1 diff --git a/tools/testing/selftests/tpm2/test_space.sh b/tools/testing/selftests/tpm2/test_space.sh index 04c47b13fe8a..93894cbc89a8 100755 --- a/tools/testing/selftests/tpm2/test_space.sh +++ b/tools/testing/selftests/tpm2/test_space.sh @@ -6,4 +6,4 @@ ksft_skip=4 [ -e /dev/tpmrm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.SpaceTest +python3 -m unittest -v tpm2_tests.SpaceTest 2>&1 -- cgit v1.2.3 From 37ee7d1995701c1819e1cc984bdd111fe23c7f5f Mon Sep 17 00:00:00 2001 From: Chang Yu Date: Tue, 23 Jul 2024 21:21:28 -0700 Subject: selftests/exec: Fix grammar in an error message. Replace "not ... nor" in the error message with "neither ... nor". Signed-off-by: Chang Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/exec/execveat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index 6418ded40bdd..071e03532cba 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -117,7 +117,7 @@ static int check_execveat_invoked_rc(int fd, const char *path, int flags, } if ((WEXITSTATUS(status) != expected_rc) && (WEXITSTATUS(status) != expected_rc2)) { - ksft_print_msg("child %d exited with %d not %d nor %d\n", + ksft_print_msg("child %d exited with %d neither %d nor %d\n", child, WEXITSTATUS(status), expected_rc, expected_rc2); ksft_test_result_fail("%s\n", test_name); -- cgit v1.2.3 From 0b631ed3ce922b34dad4938215f84e5321f7e524 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 16 Jul 2024 00:56:34 +0530 Subject: kselftest: cpufreq: Add RTC wakeup alarm Add an RTC wakeup alarm for devices to resume after a specific time interval. This improvement to the test will help enable it in CI systems and will eliminate the need for manual intervention to resume the devices after suspend/hibernation.
Signed-off-by: Shreeya Patel Acked-by: Viresh Kumar Signed-off-by: Shuah Khan --- tools/testing/selftests/cpufreq/cpufreq.sh | 15 +++++++++++++++ tools/testing/selftests/cpufreq/main.sh | 13 ++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/cpufreq/cpufreq.sh b/tools/testing/selftests/cpufreq/cpufreq.sh index a8b1dbc0a3a5..e350c521b467 100755 --- a/tools/testing/selftests/cpufreq/cpufreq.sh +++ b/tools/testing/selftests/cpufreq/cpufreq.sh @@ -231,6 +231,21 @@ do_suspend() for i in `seq 1 $2`; do printf "Starting $1\n" + + if [ "$3" = "rtc" ]; then + if ! command -v rtcwake &> /dev/null; then + printf "rtcwake could not be found, please install it.\n" + return 1 + fi + + rtcwake -m $filename -s 15 + + if [ $? -ne 0 ]; then + printf "Failed to suspend using RTC wake alarm\n" + return 1 + fi + fi + echo $filename > $SYSFS/power/state printf "Came out of $1\n" diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh index a0eb84cf7167..f12ff7416e41 100755 --- a/tools/testing/selftests/cpufreq/main.sh +++ b/tools/testing/selftests/cpufreq/main.sh @@ -24,6 +24,8 @@ helpme() [-t Date: Wed, 31 Jul 2024 00:14:37 -0500 Subject: scx/selftests: Verify we can call create_dsq from prog_run We already have some testcases verifying that we can call BPF_PROG_TYPE_SYSCALL progs and invoke scx_bpf_exit(). Let's extend that to also call scx_bpf_create_dsq() so we get coverage for that as well. Signed-off-by: David Vernet Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/prog_run.bpf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c index fd2c8f12af16..6a4d7c48e3f2 100644 --- a/tools/testing/selftests/sched_ext/prog_run.bpf.c +++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c @@ -16,6 +16,7 @@ char _license[] SEC("license") = "GPL"; SEC("syscall") int BPF_PROG(prog_run_syscall) { + scx_bpf_create_dsq(0, -1); scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); return 0; } -- cgit v1.2.3 From 3656e566cf03ab0f959b2bd6f8274ee9799641e6 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:42 +0800 Subject: perf test: Add landlock workload We'll use it to add a regression test for the BTF augmentation of enum arguments for tracepoints in 'perf trace': root@x1:~# perf trace -e landlock_add_rule perf test -w landlock 0.000 ( 0.009 ms): perf/747160 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_PATH_BENEATH, rule_attr: 0x7ffd8e258594, flags: 45) = -1 EINVAL (Invalid argument) 0.011 ( 0.002 ms): perf/747160 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_NET_PORT, rule_attr: 0x7ffd8e2585a0, flags: 45) = -1 EINVAL (Invalid argument) root@x1:~# Committer notes: It was agreed in the discussion (see Link below) to shorten the name of the workload from 'landlock_add_rule' to 'landlock', and I moved it to a separate patch. Also, to address a build failure from Namhyung, I stopped loading linux/landlock.h and instead added the used defines, enums and types to make this build on older systems. All we want is to emit the syscall and intercept it.
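Stripped of the perf test plumbing, the workload reduces to something like the sketch below (assumptions: the x86_64 syscall number 445 that the patch itself uses as a fallback, and the same deliberately bogus fd = 11 and flags = 45; the return value is ignored since only the intercepted arguments matter):

```c
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_landlock_add_rule
#define __NR_landlock_add_rule 445
#endif

int main(void)
{
	/* Bogus arguments on purpose: the call is expected to fail with
	 * EINVAL; we only want 'perf trace' to see the enum rule_type. */
	syscall(__NR_landlock_add_rule, 11, 1 /* LANDLOCK_RULE_PATH_BENEATH */,
		(void *)0, 45);
	return 0;
}
```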
Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAH0uvohaypdTV6Z7O5QSK+va_qnhZ6BP6oSJ89s1c1E0CjgxDA@mail.gmail.com Link: https://lore.kernel.org/r/20240624181345.124764-1-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240624181345.124764-6-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 1 + tools/perf/tests/tests.h | 1 + tools/perf/tests/workloads/Build | 1 + tools/perf/tests/workloads/landlock.c | 66 +++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+) create mode 100644 tools/perf/tests/workloads/landlock.c (limited to 'tools') diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index c3d84b67ca8e..470a9709427d 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -152,6 +152,7 @@ static struct test_workload *workloads[] = { &workload__sqrtloop, &workload__brstack, &workload__datasym, + &workload__landlock, }; static int num_subtests(const struct test_suite *t) diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 3aa7701ee0e9..6ea2be86b7bf 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -205,6 +205,7 @@ DECLARE_WORKLOAD(leafloop); DECLARE_WORKLOAD(sqrtloop); DECLARE_WORKLOAD(brstack); DECLARE_WORKLOAD(datasym); +DECLARE_WORKLOAD(landlock); extern const char *dso_to_test; extern const char *test_objdump_path; diff --git a/tools/perf/tests/workloads/Build b/tools/perf/tests/workloads/Build index 48bf0d3b0f3d..5af17206f04d 100644 --- a/tools/perf/tests/workloads/Build +++ b/tools/perf/tests/workloads/Build @@ -6,6 +6,7 @@ perf-test-y += leafloop.o perf-test-y += sqrtloop.o perf-test-y += brstack.o perf-test-y += datasym.o +perf-test-y += landlock.o CFLAGS_sqrtloop.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE CFLAGS_leafloop.o = -g -O0 -fno-inline -fno-omit-frame-pointer -U_FORTIFY_SOURCE diff --git a/tools/perf/tests/workloads/landlock.c b/tools/perf/tests/workloads/landlock.c new file mode 100644 index 000000000000..e2b5ef647c09 --- /dev/null +++ b/tools/perf/tests/workloads/landlock.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include "../tests.h" + +/* This workload was initially added to test enum augmentation with BTF in perf + * trace because it's the only syscall that has an enum argument. Since it is + * a recent addition to the Linux kernel (at the time of the introduction of this + * 'perf test' workload) we just add the required types and defines here instead + * of including linux/landlock.h, which isn't available on older systems. + * + * We are not interested in the result of the syscall, just in intercepting + * its arguments.
+ */ + +#ifndef __NR_landlock_add_rule +#define __NR_landlock_add_rule 445 +#endif + +#ifndef LANDLOCK_ACCESS_FS_READ_FILE +#define LANDLOCK_ACCESS_FS_READ_FILE (1ULL << 2) + +#define LANDLOCK_RULE_PATH_BENEATH 1 + +struct landlock_path_beneath_attr { + __u64 allowed_access; + __s32 parent_fd; +}; +#endif + +#ifndef LANDLOCK_ACCESS_NET_CONNECT_TCP +#define LANDLOCK_ACCESS_NET_CONNECT_TCP (1ULL << 1) + +#define LANDLOCK_RULE_NET_PORT 2 + +struct landlock_net_port_attr { + __u64 allowed_access; + __u64 port; +}; +#endif + +static int landlock(int argc __maybe_unused, const char **argv __maybe_unused) +{ + int fd = 11, flags = 45; + + struct landlock_path_beneath_attr path_beneath_attr = { + .allowed_access = LANDLOCK_ACCESS_FS_READ_FILE, + .parent_fd = 14, + }; + + struct landlock_net_port_attr net_port_attr = { + .port = 19, + .allowed_access = LANDLOCK_ACCESS_NET_CONNECT_TCP, + }; + + syscall(__NR_landlock_add_rule, fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath_attr, flags); + + syscall(__NR_landlock_add_rule, fd, LANDLOCK_RULE_NET_PORT, + &net_port_attr, flags); + + return 0; +} + +DEFINE_WORKLOAD(landlock); -- cgit v1.2.3 From d66763fed30f0bd8cc8fb2c8c144c69fcb560bda Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:43 +0800 Subject: perf test trace_btf_enum: Add regression test for the BTF augmentation of enums in 'perf trace' Trace landlock_add_rule syscall to see if the output is desirable. Trace the non-syscall tracepoint 'timer:hrtimer_init' and 'timer:hrtimer_start', see if the 'mode' argument is augmented, the 'mode' enum argument has the prefix of 'HRTIMER_MODE_' in its name. Committer testing: root@x1:~# perf test enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf trace -e landlock_add_rule perf test -v enum 0.000 ( 0.010 ms): perf/749827 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_PATH_BENEATH, rule_attr: 0x7ffd324171d4, flags: 45) = -1 EINVAL (Invalid argument) 0.012 ( 0.002 ms): perf/749827 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_NET_PORT, rule_attr: 0x7ffd324171e0, flags: 45) = -1 EINVAL (Invalid argument) 457.821 ( 0.007 ms): perf/749830 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_PATH_BENEATH, rule_attr: 0x7ffd4acd31e4, flags: 45) = -1 EINVAL (Invalid argument) 457.832 ( 0.003 ms): perf/749830 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_NET_PORT, rule_attr: 0x7ffd4acd31f0, flags: 45) = -1 EINVAL (Invalid argument) 124: perf trace enum augmentation tests : Ok root@x1:~# Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240619082042.4173621-6-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240624181345.124764-7-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/trace_btf_enum.sh | 61 ++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100755 tools/perf/tests/shell/trace_btf_enum.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/trace_btf_enum.sh b/tools/perf/tests/shell/trace_btf_enum.sh new file mode 100755 index 000000000000..7d407b52bea5 --- /dev/null +++ b/tools/perf/tests/shell/trace_btf_enum.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# perf trace enum augmentation tests +# SPDX-License-Identifier: GPL-2.0 + +err=0 +set -e + +syscall="landlock_add_rule" 
+non_syscall="timer:hrtimer_init,timer:hrtimer_start" + +TESTPROG="perf test -w landlock" + +. "$(dirname $0)"/lib/probe.sh +skip_if_no_perf_trace || exit 2 + +check_vmlinux() { + echo "Checking if vmlinux exists" + if ! ls /sys/kernel/btf/vmlinux 1>/dev/null 2>&1 + then + echo "trace+enum test [Skipped missing vmlinux BTF support]" + err=2 + fi +} + +trace_landlock() { + echo "Tracing syscall ${syscall}" + + # test flight just to see if landlock_add_rule and libbpf are available + $TESTPROG + + if perf trace -e $syscall $TESTPROG 2>&1 | \ + grep -q -E ".*landlock_add_rule\(ruleset_fd: 11, rule_type: (LANDLOCK_RULE_PATH_BENEATH|LANDLOCK_RULE_NET_PORT), rule_attr: 0x[a-f0-9]+, flags: 45\) = -1.*" + then + err=0 + else + err=1 + fi +} + +trace_non_syscall() { + echo "Tracing non-syscall tracepoint ${non_syscall}" + if perf trace -e $non_syscall --max-events=1 2>&1 | \ + grep -q -E '.*timer:hrtimer_.*\(.*mode: HRTIMER_MODE_.*\)$' + then + err=0 + else + err=1 + fi +} + +check_vmlinux + +if [ $err = 0 ]; then + trace_landlock +fi + +if [ $err = 0 ]; then + trace_non_syscall +fi + +exit $err -- cgit v1.2.3 From 62284329b194606f73252935cc422cf6156e811a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 25 Jun 2024 02:13:44 +0800 Subject: perf trace: Introduce trace__btf_scnprintf() To have a central place that will look at the BTF type and call the right scnprintf routine or return zero, meaning BTF pretty printing isn't available or not implemented for a specific type. Signed-off-by: Howard Chu Tested-by: Howard Chu Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240624181345.124764-8-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 49 +++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index e664001d5ed7..d9104fc4f61f 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -981,18 +981,28 @@ static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, c static size_t trace__btf_enum_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, size_t size, int val, char *type) { - if (trace->btf == NULL) - return 0; - if (syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type) < 0) return 0; return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); } + +static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, + size_t size, int val, char *type) +{ + if (trace->btf == NULL) + return 0; + + if (arg_fmt->is_enum) + return trace__btf_enum_scnprintf(trace, arg_fmt, bf, size, val, type); + + return 0; +} + #else // HAVE_LIBBPF_SUPPORT -static size_t trace__btf_enum_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg_fmt *arg_fmt __maybe_unused, - char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, - char *type __maybe_unused) +static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg_fmt *arg_fmt __maybe_unused, + char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, + char *type __maybe_unused) { return 0; } @@ -2183,7 +2193,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, unsigned char *args, void *augmented_args, int augmented_args_size, struct trace *trace, struct thread *thread) { - 
size_t printed = 0, btf_printed; unsigned long val; u8 bit = 1; struct syscall_arg arg = { @@ -2237,13 +2247,11 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); - if (sc->arg_fmt[arg.idx].is_enum) { - size_t p = trace__btf_enum_scnprintf(trace, &sc->arg_fmt[arg.idx], bf + printed, - size - printed, val, field->type); - if (p) { - printed += p; - continue; - } + btf_printed = trace__btf_scnprintf(trace, &sc->arg_fmt[arg.idx], bf + printed, + size - printed, val, field->type); + if (btf_printed) { + printed += btf_printed; + continue; } printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], @@ -2892,7 +2900,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, size_t size = sizeof(bf); struct tep_format_field *field = evsel->tp_format->format.fields; struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel); - size_t printed = 0; + size_t printed = 0, btf_printed; unsigned long val; u8 bit = 1; struct syscall_arg syscall_arg = { @@ -2942,13 +2950,10 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); - if (arg->is_enum) { - size_t p = trace__btf_enum_scnprintf(trace, arg, bf + printed, - size - printed, val, field->type); - if (p) { - printed += p; - continue; - } + btf_printed = trace__btf_scnprintf(trace, arg, bf + printed, size - printed, val, field->type); + if (btf_printed) { + printed += btf_printed; + continue; } printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val); -- cgit v1.2.3 From c3d747134cec49aa95aad22d14deffa43b2be37d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 25 Jun 2024 02:13:45 +0800 Subject: perf trace: Remove arg_fmt->is_enum, we can get that from the BTF type This is to pave the way for other BTF types, i.e. we try to find the BTF type and then use things like btf_is_enum(btf_type) that we cached to find the right strtoul and scnprintf routines. For now only enum is supported; all the other types simply return zero for scnprintf, which makes it have the same behaviour as when BTF isn't available, i.e. fall back to no pretty printing. Ditto for strtoul.
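As a rough sketch of the dispatch this change moves to (using libbpf's real btf_is_enum() helper; pretty_print() is a hypothetical stand-in for the perf scnprintf path):

```c
#include <bpf/btf.h>
#include <stddef.h>

/* Hypothetical stand-in for trace__btf_scnprintf(): branch on the
 * cached BTF kind; any kind not handled yet returns 0, the same
 * behaviour as when BTF isn't available at all. */
static size_t pretty_print(const struct btf_type *bt)
{
	if (bt == NULL)		/* no BTF type was resolved */
		return 0;

	if (btf_is_enum(bt))	/* only enums are supported for now */
		return 1;	/* would call btf_enum_scnprintf() here */

	return 0;		/* other kinds: no pretty printing */
}
```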
root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# Signed-off-by: Howard Chu Tested-by: Howard Chu Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240624181345.124764-9-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 101 +++++++++++++++++++++------------------ tools/perf/trace/beauty/beauty.h | 1 + 2 files changed, 56 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d9104fc4f61f..488c2cedc110 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -111,7 +111,6 @@ struct syscall_arg_fmt { const char *name; u16 nr_entries; // for arrays bool show_zero; - bool is_enum; #ifdef HAVE_LIBBPF_SUPPORT const struct btf_type *type; #endif @@ -910,33 +909,46 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags #ifdef HAVE_LIBBPF_SUPPORT -static int syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) +static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) { int id; - // Already cached? - if (arg_fmt->type != NULL) - return 0; - type = strstr(type, "enum "); if (type == NULL) - return -1; + return; type += 5; // skip "enum " to get the enumeration name id = btf__find_by_name(btf, type); if (id < 0) - return -1; + return; arg_fmt->type = btf__type_by_id(btf, id); - return arg_fmt->type == NULL ? 
-1 : 0; } static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val) +{ + const struct btf_type *bt = arg->fmt->type; + struct btf *btf = arg->trace->btf; + struct btf_enum *be = btf_enum(bt); + + for (int i = 0; i < btf_vlen(bt); ++i, ++be) { + const char *name = btf__name_by_offset(btf, be->name_off); + int max_len = max(size, strlen(name)); + + if (strncmp(name, bf, max_len) == 0) { + *val = be->val; + return true; + } + } + + return false; +} + +static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val) { const struct btf_type *bt; - char *type = arg->parm; - struct btf_enum *be; + char *type = arg->type_name; struct btf *btf; trace__load_vmlinux_btf(arg->trace); @@ -945,20 +957,19 @@ static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_ if (btf == NULL) return false; - if (syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type) < 0) - return false; + if (arg->fmt->type == NULL) { + // See if this is an enum + syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type); + } + // Now let's see if we have a BTF type resolved bt = arg->fmt->type; - be = btf_enum(bt); - for (int i = 0; i < btf_vlen(bt); ++i, ++be) { - const char *name = btf__name_by_offset(btf, be->name_off); - int max_len = max(size, strlen(name)); + if (bt == NULL) + return false; - if (strncmp(name, bf, max_len) == 0) { - *val = be->val; - return true; - } - } + // If it is an enum: + if (btf_is_enum(arg->fmt->type)) + return syscall_arg__strtoul_btf_enum(bf, size, arg, val); return false; } @@ -978,23 +989,23 @@ static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, c return 0; } -static size_t trace__btf_enum_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, - size_t size, int val, char *type) -{ - if (syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type) < 0) - return 0; - - return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); -} - static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, size_t size, int val, char *type) { if (trace->btf == NULL) return 0; - if (arg_fmt->is_enum) - return trace__btf_enum_scnprintf(trace, arg_fmt, bf, size, val, type); + if (arg_fmt->type == NULL) { + // Check if this is an enum and if we have the BTF type for it. + syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type); + } + + // Did we manage to find a BTF type for the syscall/tracepoint argument? 
+ if (arg_fmt->type == NULL) + return 0; + + if (btf_is_enum(arg_fmt->type)) + return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); return 0; } @@ -1007,14 +1018,14 @@ static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct sy return 0; } -static bool syscall_arg__strtoul_btf_enum(char *bf __maybe_unused, size_t size __maybe_unused, +static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused, struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused) { return false; } #endif // HAVE_LIBBPF_SUPPORT -#define STUL_BTF_ENUM syscall_arg__strtoul_btf_enum +#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type #define STRARRAY(name, array) \ { .scnprintf = SCA_STRARRAY, \ @@ -1887,7 +1898,6 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field continue; len = strlen(field->name); - arg->is_enum = false; if (strcmp(field->type, "const char *") == 0 && ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) || @@ -1915,8 +1925,8 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field */ arg->scnprintf = SCA_FD; } else if (strstr(field->type, "enum") && use_btf != NULL) { - *use_btf = arg->is_enum = true; - arg->strtoul = STUL_BTF_ENUM; + *use_btf = true; + arg->strtoul = STUL_BTF_TYPE; } else { const struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); @@ -2236,10 +2246,13 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, /* * Suppress this argument if its value is zero and show_zero * property isn't set. + * + * If it has a BTF type, then override the zero suppression knob + * as the common case is for zero in an enum to have an associated entry. */ if (val == 0 && !trace->show_zeros && !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) && - !(sc->arg_fmt && sc->arg_fmt[arg.idx].is_enum)) + !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE)) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); @@ -2942,7 +2955,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val); /* Suppress this argument if its value is zero and show_zero property isn't set. */ - if (val == 0 && !trace->show_zeros && !arg->show_zero && !arg->is_enum) + if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? 
", " : ""); @@ -3910,14 +3923,10 @@ static int trace__expand_filter(struct trace *trace, struct evsel *evsel) struct syscall_arg syscall_arg = { .trace = trace, .fmt = fmt, + .type_name = type, + .parm = fmt->parm, }; - if (fmt->is_enum) { - syscall_arg.parm = type; - } else { - syscall_arg.parm = fmt->parm; - } - if (fmt->strtoul(right, right_size, &syscall_arg, &val)) { char *n, expansion[19]; int expansion_lenght = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val); diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 78d10d92d351..3ed11e18ee2d 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -113,6 +113,7 @@ struct syscall_arg { struct thread *thread; struct trace *trace; void *parm; + char *type_name; u16 len; u8 idx; u8 mask; -- cgit v1.2.3 From e293f4b1e57fcc4d7d34b7a7a44ebec8ba8a1b7d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 Jul 2024 09:57:17 -0700 Subject: perf test: Avoid python leak sanitizer test failures Leak sanitizer will report memory leaks from python and the leak sanitizer output causes tests to fail. For example: ``` $ perf test 98 -v 98: perf script tests: --- start --- test child forked, pid 1272962 DB test [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.046 MB /tmp/perf-test-script.x0EktdCel8/perf.data (8 samples) ] call_path_table((1, 0, 0, 0) call_path_table((2, 1, 0, 140339508617447) call_path_table((3, 2, 2, 0) call_path_table((4, 3, 3, 0) call_path_table((5, 4, 4, 0) call_path_table((6, 5, 5, 0) call_path_table((7, 6, 6, 0) call_path_table((8, 7, 7, 0) call_path_table((9, 8, 8, 0) call_path_table((10, 9, 9, 0) call_path_table((11, 10, 10, 0) call_path_table((12, 11, 11, 0) call_path_table((13, 12, 1, 0) sample_table((1, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954119000, -1, 0, 0, 0, 0, 1, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((2, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954137053, -1, 0, 0, 0, 0, 1, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((3, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954140089, -1, 0, 0, 0, 0, 9, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((4, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954142376, -1, 0, 0, 0, 0, 155, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((5, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954144045, -1, 0, 0, 0, 0, 2493, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((6, 1, 1, 1, 1, 1, 12, 77, -2046828595, 588306954145722, -1, 0, 0, 0, 0, 47555, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) call_path_table((14, 9, 14, 0) call_path_table((15, 14, 15, 0) call_path_table((16, 15, 0, -1040969624) call_path_table((17, 16, 16, 0) call_path_table((18, 17, 17, 0) call_path_table((19, 18, 18, 0) call_path_table((20, 19, 19, 0) call_path_table((21, 20, 13, 0) sample_table((7, 1, 1, 1, 2, 1, 13, 46, -2053700898, 588306954157436, -1, 0, 0, 0, 0, 964078, 0, 0, 128933429281, 0, 0, 21, 0, 0, 0, -1, -1)) call_path_table((22, 1, 21, 0) call_path_table((23, 22, 22, 0) call_path_table((24, 23, 23, 0) call_path_table((25, 24, 24, 0) call_path_table((26, 25, 25, 0) call_path_table((27, 26, 26, 0) call_path_table((28, 27, 27, 0) call_path_table((29, 28, 28, 0) call_path_table((30, 29, 29, 0) call_path_table((31, 30, 30, 0) call_path_table((32, 31, 31, 0) call_path_table((33, 32, 32, 0) call_path_table((34, 33, 33, 0) call_path_table((35, 34, 20, 0) sample_table((8, 1, 1, 1, 2, 1, 20, 49, -2046878127, 588306954378624, -1, 0, 0, 0, 0, 
2534317, 0, 0, 128933429281, 0, 0, 35, 0, 0, 0, -1, -1)) ================================================================= ==1272975==ERROR: LeakSanitizer: detected memory leaks Direct leak of 13628 byte(s) in 6 object(s) allocated from: #0 0x56354f60c092 in malloc (/tmp/perf/perf+0x29c092) #1 0x7ff25c7d02e7 in _PyObject_Malloc /build/python3.11/../Objects/obmalloc.c:2003:11 #2 0x7ff25c7d02e7 in _PyObject_Malloc /build/python3.11/../Objects/obmalloc.c:1996:1 SUMMARY: AddressSanitizer: 13628 byte(s) leaked in 6 allocation(s). --- Cleaning up --- ---- end(-1) ---- 98: perf script tests : FAILED! ``` Disable leak sanitizer when running specific perf+python tests to avoid this. This causes the tests to pass when run with leak sanitizer. Reviewed-by: Aditya Gupta Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/script.sh | 3 +++ tools/perf/tests/shell/test_task_analyzer.sh | 3 +++ 2 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/script.sh b/tools/perf/tests/shell/script.sh index c1a603653662..d3e2958d2242 100755 --- a/tools/perf/tests/shell/script.sh +++ b/tools/perf/tests/shell/script.sh @@ -61,7 +61,10 @@ _end_of_file_ esac perf record $cmd_flags -o "${perfdatafile}" true + # Disable lsan to avoid warnings about python memory leaks. + export ASAN_OPTIONS=detect_leaks=0 perf script -i "${perfdatafile}" -s "${db_test}" + export ASAN_OPTIONS= echo "DB test [Success]" } diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index 92d15154ba79..cb02bf23e6a5 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -11,6 +11,9 @@ if [ -e "$perfdir/scripts/python/Perf-Trace-Util" ]; then export PERF_EXEC_PATH=$perfdir fi +# Disable lsan to avoid warnings about python memory leaks. +export ASAN_OPTIONS=detect_leaks=0 + cleanup() { rm -f perf.data rm -f perf.data.old -- cgit v1.2.3 From 1d303deedb1057c6ca36fc0d1c0d7bf58a5b7322 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:44 +0530 Subject: perf annotate: Move the data structures related to register type to header file Data type profiling uses instruction tracking by checking each instruction and updating the register type state in some data structures. This is useful to find the data type in cases when the register state gets transferred from one reg to another. For example, the "mov" instruction on x86 and the "mr" instruction on powerpc. Currently these structures are defined in annotate-data.c and instruction tracking is implemented only for x86. Move these data structures to the "annotate-data.h" header file so that other arch implementations can use them in arch-specific files as well.
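To illustrate the tracking these structures support, here is a simplified sketch (an int tag stands in for the cached Dwarf_Die): a register-to-register move such as "mov" on x86 or "mr" on powerpc just copies the type state from the source to the destination register.

```c
#define SKETCH_MAX_REGS 16	/* mirrors TYPE_STATE_MAX_REGS */

struct reg_type_sketch {
	int type_tag;	/* stand-in for the Dwarf_Die in type_state_reg */
	int ok;		/* valid only when set */
};

/* On "mov src, dst" (x86) or "mr dst, src" (powerpc), the type
 * travels with the value. */
static void track_reg_move(struct reg_type_sketch regs[SKETCH_MAX_REGS],
			   int src, int dst)
{
	regs[dst] = regs[src];
}
```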
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-2-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 53 +------------------------------------- tools/perf/util/annotate-data.h | 56 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 52 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 965da6c0b542..a4c7f98a75e3 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -31,15 +31,6 @@ static void delete_var_types(struct die_var_type *var_types); -enum type_state_kind { - TSR_KIND_INVALID = 0, - TSR_KIND_TYPE, - TSR_KIND_PERCPU_BASE, - TSR_KIND_CONST, - TSR_KIND_POINTER, - TSR_KIND_CANARY, -}; - #define pr_debug_dtp(fmt, ...) \ do { \ if (debug_type_profile) \ @@ -140,49 +131,7 @@ static void pr_debug_location(Dwarf_Die *die, u64 pc, int reg) } } -/* - * Type information in a register, valid when @ok is true. - * The @caller_saved registers are invalidated after a function call. - */ -struct type_state_reg { - Dwarf_Die type; - u32 imm_value; - bool ok; - bool caller_saved; - u8 kind; -}; - -/* Type information in a stack location, dynamically allocated */ -struct type_state_stack { - struct list_head list; - Dwarf_Die type; - int offset; - int size; - bool compound; - u8 kind; -}; - -/* FIXME: This should be arch-dependent */ -#define TYPE_STATE_MAX_REGS 16 - -/* - * State table to maintain type info in each register and stack location. - * It'll be updated when new variable is allocated or type info is moved - * to a new location (register or stack). As it'd be used with the - * shortest path of basic blocks, it only maintains a single table. - */ -struct type_state { - /* state of general purpose registers */ - struct type_state_reg regs[TYPE_STATE_MAX_REGS]; - /* state of stack location */ - struct list_head stack_vars; - /* return value register */ - int ret_reg; - /* stack pointer register */ - int stack_reg; -}; - -static bool has_reg_type(struct type_state *state, int reg) +bool has_reg_type(struct type_state *state, int reg) { return (unsigned)reg < ARRAY_SIZE(state->regs); } diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 0a57d9f5ee78..cdb5cd8960bb 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -6,6 +6,11 @@ #include #include #include +#include "annotate.h" + +#ifdef HAVE_DWARF_SUPPORT +#include "debuginfo.h" +#endif struct annotated_op_loc; struct debuginfo; @@ -15,6 +20,15 @@ struct hist_entry; struct map_symbol; struct thread; +enum type_state_kind { + TSR_KIND_INVALID = 0, + TSR_KIND_TYPE, + TSR_KIND_PERCPU_BASE, + TSR_KIND_CONST, + TSR_KIND_POINTER, + TSR_KIND_CANARY, +}; + /** * struct annotated_member - Type of member field * @node: List entry in the parent list @@ -143,6 +157,47 @@ struct annotated_data_stat { extern struct annotated_data_stat ann_data_stat; #ifdef HAVE_DWARF_SUPPORT +/* + * Type information in a register, valid when @ok is true. + * The @caller_saved registers are invalidated after a function call. 
+ */ +struct type_state_reg { + Dwarf_Die type; + u32 imm_value; + bool ok; + bool caller_saved; + u8 kind; +}; + +/* Type information in a stack location, dynamically allocated */ +struct type_state_stack { + struct list_head list; + Dwarf_Die type; + int offset; + int size; + bool compound; + u8 kind; +}; + +/* FIXME: This should be arch-dependent */ +#define TYPE_STATE_MAX_REGS 16 + +/* + * State table to maintain type info in each register and stack location. + * It'll be updated when new variable is allocated or type info is moved + * to a new location (register or stack). As it'd be used with the + * shortest path of basic blocks, it only maintains a single table. + */ +struct type_state { + /* state of general purpose registers */ + struct type_state_reg regs[TYPE_STATE_MAX_REGS]; + /* state of stack location */ + struct list_head stack_vars; + /* return value register */ + int ret_reg; + /* stack pointer register */ + int stack_reg; +}; /* Returns data type at the location (ip, reg, offset) */ struct annotated_data_type *find_data_type(struct data_loc_info *dloc); @@ -160,6 +215,7 @@ void global_var_type__tree_delete(struct rb_root *root); int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel); +bool has_reg_type(struct type_state *state, int reg); #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * -- cgit v1.2.3 From 782959ac248ac3cbac80f7476d4e0410662ff400 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:45 +0530 Subject: perf annotate: Add "update_insn_state" callback function to handle arch specific instruction tracking Add an "update_insn_state" callback to "struct arch" to handle instruction tracking. Currently, updating the instruction state is handled by the static function "update_insn_state_x86", which is defined in "annotate-data.c". Make this a callback for the specific arch and move it to the arch-specific file "arch/x86/annotate/instructions.c". This will help in adding helper functions for other platforms in "arch//annotate/instructions.c" and make changes/updates easier. Define the callback "update_insn_state" as part of "struct arch", and also make some of the debug functions non-static so that they can be referenced from other places.
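The resulting shape, condensed into a sketch (names suffixed _sketch are hypothetical; the real hook in perf also receives struct data_loc_info and the compile-unit Dwarf_Die):

```c
struct type_state;	/* the table moved into annotate-data.h above */
struct disasm_line;

/* Each architecture supplies its own tracker; after this change the
 * x86 one lives in arch/x86/annotate/instructions.c. */
struct arch_sketch {
	const char *name;
	void (*update_insn_state)(struct type_state *state,
				  struct disasm_line *dl);
};

static void update_insn_state_x86_sketch(struct type_state *state,
					 struct disasm_line *dl)
{
	(void)state;
	(void)dl;	/* mov/add/call tracking goes here */
}

static struct arch_sketch arch_x86_sketch = {
	.name = "x86",
	.update_insn_state = update_insn_state_x86_sketch,
};
```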
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-3-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 377 +++++++++++++++++++++++++++ tools/perf/util/annotate-data.c | 391 +--------------------------- tools/perf/util/annotate-data.h | 23 ++ tools/perf/util/disasm.c | 4 + tools/perf/util/disasm.h | 12 + 5 files changed, 424 insertions(+), 383 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 5cdf457f5cbe..7b7d462c6c6b 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -206,3 +206,380 @@ static int x86__annotate_init(struct arch *arch, char *cpuid) arch->initialized = true; return err; } + +#ifdef HAVE_DWARF_SUPPORT +static void update_insn_state_x86(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die *cu_die, + struct disasm_line *dl) +{ + struct annotated_insn_loc loc; + struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; + struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; + struct type_state_reg *tsr; + Dwarf_Die type_die; + u32 insn_offset = dl->al.offset; + int fbreg = dloc->fbreg; + int fboff = 0; + + if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) + return; + + if (ins__is_call(&dl->ins)) { + struct symbol *func = dl->ops.target.sym; + + if (func == NULL) + return; + + /* __fentry__ will preserve all registers */ + if (!strcmp(func->name, "__fentry__")) + return; + + pr_debug_dtp("call [%x] %s\n", insn_offset, func->name); + + /* Otherwise invalidate caller-saved registers after call */ + for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) { + if (state->regs[i].caller_saved) + state->regs[i].ok = false; + } + + /* Update register with the return type (if any) */ + if (die_find_func_rettype(cu_die, func->name, &type_die)) { + tsr = &state->regs[state->ret_reg]; + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("call [%x] return -> reg%d", + insn_offset, state->ret_reg); + pr_debug_type_name(&type_die, tsr->kind); + } + return; + } + + if (!strncmp(dl->ins.name, "add", 3)) { + u64 imm_value = -1ULL; + int offset; + const char *var_name = NULL; + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + + if (src->imm) + imm_value = src->offset; + else if (has_reg_type(state, src->reg1) && + state->regs[src->reg1].kind == TSR_KIND_CONST) + imm_value = state->regs[src->reg1].imm_value; + else if (src->reg1 == DWARF_REG_PC) { + u64 var_addr = annotate_calc_pcrel(dloc->ms, ip, + src->offset, dl); + + if (get_global_var_info(dloc, var_addr, + &var_name, &offset) && + !strcmp(var_name, "this_cpu_off") && + tsr->kind == TSR_KIND_CONST) { + tsr->kind = TSR_KIND_PERCPU_BASE; + imm_value = tsr->imm_value; + } + } + else + return; + + if (tsr->kind != TSR_KIND_PERCPU_BASE) + return; + + if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset, + &type_die) && offset == 0) { + /* + * This is not a pointer type, but it should be treated + * as a pointer. 
+ */ + tsr->type = type_die; + tsr->kind = TSR_KIND_POINTER; + tsr->ok = true; + + pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", + insn_offset, imm_value, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + return; + } + + if (strncmp(dl->ins.name, "mov", 3)) + return; + + if (dloc->fb_cfa) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 pc = map__rip_2objdump(dloc->ms->map, ip); + + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) + fbreg = -1; + } + + /* Case 1. register to register or segment:offset to register transfers */ + if (!src->mem_ref && !dst->mem_ref) { + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + if (dso__kernel(map__dso(dloc->ms->map)) && + src->segment == INSN_SEG_X86_GS && src->imm) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr; + int offset; + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. + */ + var_addr = src->offset; + + if (var_addr == 40) { + tsr->kind = TSR_KIND_CANARY; + tsr->ok = true; + + pr_debug_dtp("mov [%x] stack canary -> reg%d\n", + insn_offset, dst->reg1); + return; + } + + if (!get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", + insn_offset, var_addr, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + return; + } + + if (src->imm) { + tsr->kind = TSR_KIND_CONST; + tsr->imm_value = src->offset; + tsr->ok = true; + + pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n", + insn_offset, tsr->imm_value, dst->reg1); + return; + } + + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) { + tsr->ok = false; + return; + } + + tsr->type = state->regs[src->reg1].type; + tsr->kind = state->regs[src->reg1].kind; + tsr->ok = true; + + pr_debug_dtp("mov [%x] reg%d -> reg%d", + insn_offset, src->reg1, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Case 2. 
memory to register transers */ + if (src->mem_ref && !dst->mem_ref) { + int sreg = src->reg1; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + +retry: + /* Check stack variables with offset */ + if (sreg == fbreg) { + struct type_state_stack *stack; + int offset = src->offset - fboff; + + stack = find_stack_state(state, offset); + if (stack == NULL) { + tsr->ok = false; + return; + } else if (!stack->compound) { + tsr->type = stack->type; + tsr->kind = stack->kind; + tsr->ok = true; + } else if (die_get_member_type(&stack->type, + offset - stack->offset, + &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + } else { + tsr->ok = false; + return; + } + + pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", + insn_offset, -offset, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* And then dereference the pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_TYPE && + die_deref_ptr_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Or check if it's a global variable */ + else if (sreg == DWARF_REG_PC) { + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + u64 addr; + int offset; + + addr = annotate_calc_pcrel(ms, ip, src->offset, dl); + + if (!get_global_var_type(cu_die, dloc, ip, addr, &offset, + &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", + insn_offset, addr, dst->reg1); + pr_debug_type_name(&type_die, tsr->kind); + } + /* And check percpu access with base register */ + else if (has_reg_type(state, sreg) && + state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr = src->offset; + int offset; + + if (src->multi_regs) { + int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. 
+ */ + if (get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) && + die_get_member_type(&type_die, offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + if (src->multi_regs) { + pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d", + insn_offset, src->offset, src->reg1, + src->reg2, dst->reg1); + } else { + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + } + pr_debug_type_name(&tsr->type, tsr->kind); + } else { + tsr->ok = false; + } + } + /* And then dereference the calculated pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_POINTER && + die_get_member_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Or try another register if any */ + else if (src->multi_regs && sreg == src->reg1 && + src->reg1 != src->reg2) { + sreg = src->reg2; + goto retry; + } + else { + int offset; + const char *var_name = NULL; + + /* it might be per-cpu variable (in kernel) access */ + if (src->offset < 0) { + if (get_global_var_info(dloc, (s64)src->offset, + &var_name, &offset) && + !strcmp(var_name, "__per_cpu_offset")) { + tsr->kind = TSR_KIND_PERCPU_BASE; + + pr_debug_dtp("mov [%x] percpu base reg%d\n", + insn_offset, dst->reg1); + } + } + + tsr->ok = false; + } + } + /* Case 3. register to memory transfers */ + if (!src->mem_ref && dst->mem_ref) { + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) + return; + + /* Check stack variables with offset */ + if (dst->reg1 == fbreg) { + struct type_state_stack *stack; + int offset = dst->offset - fboff; + + tsr = &state->regs[src->reg1]; + + stack = find_stack_state(state, offset); + if (stack) { + /* + * The source register is likely to hold a type + * of member if it's a compound type. Do not + * update the stack variable type since we can + * get the member type later by using the + * die_get_member_type(). + */ + if (!stack->compound) + set_stack_state(stack, offset, tsr->kind, + &tsr->type); + } else { + findnew_stack_state(state, offset, tsr->kind, + &tsr->type); + } + + pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", + insn_offset, src->reg1, -offset); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* + * Ignore other transfers since it'd set a value in a struct + * and won't change the type. + */ + } + /* Case 4. 
memory to memory transfers (not handled for now) */ +} +#endif diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index a4c7f98a75e3..7a48c3d72b89 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -39,7 +39,7 @@ do { \ pr_debug3(fmt, ##__VA_ARGS__); \ } while (0) -static void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) +void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) { struct strbuf sb; char *str; @@ -390,7 +390,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, return 0; } -static struct type_state_stack *find_stack_state(struct type_state *state, +struct type_state_stack *find_stack_state(struct type_state *state, int offset) { struct type_state_stack *stack; @@ -406,7 +406,7 @@ static struct type_state_stack *find_stack_state(struct type_state *state, return NULL; } -static void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, +void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, Dwarf_Die *type_die) { int tag; @@ -433,7 +433,7 @@ static void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, } } -static struct type_state_stack *findnew_stack_state(struct type_state *state, +struct type_state_stack *findnew_stack_state(struct type_state *state, int offset, u8 kind, Dwarf_Die *type_die) { @@ -537,7 +537,7 @@ void global_var_type__tree_delete(struct rb_root *root) } } -static bool get_global_var_info(struct data_loc_info *dloc, u64 addr, +bool get_global_var_info(struct data_loc_info *dloc, u64 addr, const char **var_name, int *var_offset) { struct addr_location al; @@ -611,7 +611,7 @@ static void global_var__collect(struct data_loc_info *dloc) } } -static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, +bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, u64 ip, u64 var_addr, int *var_offset, Dwarf_Die *type_die) { @@ -722,381 +722,6 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo } } -static void update_insn_state_x86(struct type_state *state, - struct data_loc_info *dloc, Dwarf_Die *cu_die, - struct disasm_line *dl) -{ - struct annotated_insn_loc loc; - struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; - struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; - struct type_state_reg *tsr; - Dwarf_Die type_die; - u32 insn_offset = dl->al.offset; - int fbreg = dloc->fbreg; - int fboff = 0; - - if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) - return; - - if (ins__is_call(&dl->ins)) { - struct symbol *func = dl->ops.target.sym; - - if (func == NULL) - return; - - /* __fentry__ will preserve all registers */ - if (!strcmp(func->name, "__fentry__")) - return; - - pr_debug_dtp("call [%x] %s\n", insn_offset, func->name); - - /* Otherwise invalidate caller-saved registers after call */ - for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) { - if (state->regs[i].caller_saved) - state->regs[i].ok = false; - } - - /* Update register with the return type (if any) */ - if (die_find_func_rettype(cu_die, func->name, &type_die)) { - tsr = &state->regs[state->ret_reg]; - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("call [%x] return -> reg%d", - insn_offset, state->ret_reg); - pr_debug_type_name(&type_die, tsr->kind); - } - return; - } - - if (!strncmp(dl->ins.name, "add", 3)) { - u64 imm_value = -1ULL; - int offset; - const char *var_name = NULL; - struct map_symbol *ms = 
dloc->ms; - u64 ip = ms->sym->start + dl->al.offset; - - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - - if (src->imm) - imm_value = src->offset; - else if (has_reg_type(state, src->reg1) && - state->regs[src->reg1].kind == TSR_KIND_CONST) - imm_value = state->regs[src->reg1].imm_value; - else if (src->reg1 == DWARF_REG_PC) { - u64 var_addr = annotate_calc_pcrel(dloc->ms, ip, - src->offset, dl); - - if (get_global_var_info(dloc, var_addr, - &var_name, &offset) && - !strcmp(var_name, "this_cpu_off") && - tsr->kind == TSR_KIND_CONST) { - tsr->kind = TSR_KIND_PERCPU_BASE; - imm_value = tsr->imm_value; - } - } - else - return; - - if (tsr->kind != TSR_KIND_PERCPU_BASE) - return; - - if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset, - &type_die) && offset == 0) { - /* - * This is not a pointer type, but it should be treated - * as a pointer. - */ - tsr->type = type_die; - tsr->kind = TSR_KIND_POINTER; - tsr->ok = true; - - pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", - insn_offset, imm_value, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - return; - } - - if (strncmp(dl->ins.name, "mov", 3)) - return; - - if (dloc->fb_cfa) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 pc = map__rip_2objdump(dloc->ms->map, ip); - - if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) - fbreg = -1; - } - - /* Case 1. register to register or segment:offset to register transfers */ - if (!src->mem_ref && !dst->mem_ref) { - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - if (dso__kernel(map__dso(dloc->ms->map)) && - src->segment == INSN_SEG_X86_GS && src->imm) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 var_addr; - int offset; - - /* - * In kernel, %gs points to a per-cpu region for the - * current CPU. Access with a constant offset should - * be treated as a global variable access. - */ - var_addr = src->offset; - - if (var_addr == 40) { - tsr->kind = TSR_KIND_CANARY; - tsr->ok = true; - - pr_debug_dtp("mov [%x] stack canary -> reg%d\n", - insn_offset, dst->reg1); - return; - } - - if (!get_global_var_type(cu_die, dloc, ip, var_addr, - &offset, &type_die) || - !die_get_member_type(&type_die, offset, &type_die)) { - tsr->ok = false; - return; - } - - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", - insn_offset, var_addr, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - return; - } - - if (src->imm) { - tsr->kind = TSR_KIND_CONST; - tsr->imm_value = src->offset; - tsr->ok = true; - - pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n", - insn_offset, tsr->imm_value, dst->reg1); - return; - } - - if (!has_reg_type(state, src->reg1) || - !state->regs[src->reg1].ok) { - tsr->ok = false; - return; - } - - tsr->type = state->regs[src->reg1].type; - tsr->kind = state->regs[src->reg1].kind; - tsr->ok = true; - - pr_debug_dtp("mov [%x] reg%d -> reg%d", - insn_offset, src->reg1, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Case 2. 
memory to register transers */ - if (src->mem_ref && !dst->mem_ref) { - int sreg = src->reg1; - - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - -retry: - /* Check stack variables with offset */ - if (sreg == fbreg) { - struct type_state_stack *stack; - int offset = src->offset - fboff; - - stack = find_stack_state(state, offset); - if (stack == NULL) { - tsr->ok = false; - return; - } else if (!stack->compound) { - tsr->type = stack->type; - tsr->kind = stack->kind; - tsr->ok = true; - } else if (die_get_member_type(&stack->type, - offset - stack->offset, - &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - } else { - tsr->ok = false; - return; - } - - pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", - insn_offset, -offset, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* And then dereference the pointer if it has one */ - else if (has_reg_type(state, sreg) && state->regs[sreg].ok && - state->regs[sreg].kind == TSR_KIND_TYPE && - die_deref_ptr_type(&state->regs[sreg].type, - src->offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Or check if it's a global variable */ - else if (sreg == DWARF_REG_PC) { - struct map_symbol *ms = dloc->ms; - u64 ip = ms->sym->start + dl->al.offset; - u64 addr; - int offset; - - addr = annotate_calc_pcrel(ms, ip, src->offset, dl); - - if (!get_global_var_type(cu_die, dloc, ip, addr, &offset, - &type_die) || - !die_get_member_type(&type_die, offset, &type_die)) { - tsr->ok = false; - return; - } - - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", - insn_offset, addr, dst->reg1); - pr_debug_type_name(&type_die, tsr->kind); - } - /* And check percpu access with base register */ - else if (has_reg_type(state, sreg) && - state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 var_addr = src->offset; - int offset; - - if (src->multi_regs) { - int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1; - - if (has_reg_type(state, reg2) && state->regs[reg2].ok && - state->regs[reg2].kind == TSR_KIND_CONST) - var_addr += state->regs[reg2].imm_value; - } - - /* - * In kernel, %gs points to a per-cpu region for the - * current CPU. Access with a constant offset should - * be treated as a global variable access. 
- */ - if (get_global_var_type(cu_die, dloc, ip, var_addr, - &offset, &type_die) && - die_get_member_type(&type_die, offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - if (src->multi_regs) { - pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d", - insn_offset, src->offset, src->reg1, - src->reg2, dst->reg1); - } else { - pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - } - pr_debug_type_name(&tsr->type, tsr->kind); - } else { - tsr->ok = false; - } - } - /* And then dereference the calculated pointer if it has one */ - else if (has_reg_type(state, sreg) && state->regs[sreg].ok && - state->regs[sreg].kind == TSR_KIND_POINTER && - die_get_member_type(&state->regs[sreg].type, - src->offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Or try another register if any */ - else if (src->multi_regs && sreg == src->reg1 && - src->reg1 != src->reg2) { - sreg = src->reg2; - goto retry; - } - else { - int offset; - const char *var_name = NULL; - - /* it might be per-cpu variable (in kernel) access */ - if (src->offset < 0) { - if (get_global_var_info(dloc, (s64)src->offset, - &var_name, &offset) && - !strcmp(var_name, "__per_cpu_offset")) { - tsr->kind = TSR_KIND_PERCPU_BASE; - - pr_debug_dtp("mov [%x] percpu base reg%d\n", - insn_offset, dst->reg1); - } - } - - tsr->ok = false; - } - } - /* Case 3. register to memory transfers */ - if (!src->mem_ref && dst->mem_ref) { - if (!has_reg_type(state, src->reg1) || - !state->regs[src->reg1].ok) - return; - - /* Check stack variables with offset */ - if (dst->reg1 == fbreg) { - struct type_state_stack *stack; - int offset = dst->offset - fboff; - - tsr = &state->regs[src->reg1]; - - stack = find_stack_state(state, offset); - if (stack) { - /* - * The source register is likely to hold a type - * of member if it's a compound type. Do not - * update the stack variable type since we can - * get the member type later by using the - * die_get_member_type(). - */ - if (!stack->compound) - set_stack_state(stack, offset, tsr->kind, - &tsr->type); - } else { - findnew_stack_state(state, offset, tsr->kind, - &tsr->type); - } - - pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", - insn_offset, src->reg1, -offset); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* - * Ignore other transfers since it'd set a value in a struct - * and won't change the type. - */ - } - /* Case 4. memory to memory transfers (not handled for now) */ -} - /** * update_insn_state - Update type state for an instruction * @state: type state table @@ -1115,8 +740,8 @@ retry: static void update_insn_state(struct type_state *state, struct data_loc_info *dloc, Dwarf_Die *cu_die, struct disasm_line *dl) { - if (arch__is(dloc->arch, "x86")) - update_insn_state_x86(state, dloc, cu_die, dl); + if (dloc->arch->update_insn_state) + dloc->arch->update_insn_state(state, dloc, cu_die, dl); } /* diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index cdb5cd8960bb..6fe8ee8b8410 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -6,6 +6,7 @@ #include #include #include +#include "dwarf-regs.h" #include "annotate.h" #ifdef HAVE_DWARF_SUPPORT @@ -20,6 +21,14 @@ struct hist_entry; struct map_symbol; struct thread; +#define pr_debug_dtp(fmt, ...) 
\ +do { \ + if (debug_type_profile) \ + pr_info(fmt, ##__VA_ARGS__); \ + else \ + pr_debug3(fmt, ##__VA_ARGS__); \ +} while (0) + enum type_state_kind { TSR_KIND_INVALID = 0, TSR_KIND_TYPE, @@ -216,6 +225,20 @@ void global_var_type__tree_delete(struct rb_root *root); int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel); bool has_reg_type(struct type_state *state, int reg); +struct type_state_stack *findnew_stack_state(struct type_state *state, + int offset, u8 kind, + Dwarf_Die *type_die); +void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, + Dwarf_Die *type_die); +struct type_state_stack *find_stack_state(struct type_state *state, + int offset); +bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, + u64 ip, u64 var_addr, int *var_offset, + Dwarf_Die *type_die); +bool get_global_var_info(struct data_loc_info *dloc, u64 addr, + const char **var_name, int *var_offset); +void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind); + #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index e10558b79504..931cd92dcc40 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -12,6 +12,7 @@ #include #include "annotate.h" +#include "annotate-data.h" #include "build-id.h" #include "debug.h" #include "disasm.h" @@ -145,6 +146,9 @@ static struct arch architectures[] = { .memory_ref_char = '(', .imm_char = '$', }, +#ifdef HAVE_DWARF_SUPPORT + .update_insn_state = update_insn_state_x86, +#endif }, { .name = "powerpc", diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 3d381a043520..c835759c8e2b 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -4,11 +4,18 @@ #include "map_symbol.h" +#ifdef HAVE_DWARF_SUPPORT +#include "dwarf-aux.h" +#endif + struct annotation_options; struct disasm_line; struct ins; struct evsel; struct symbol; +struct data_loc_info; +struct type_state; +struct disasm_line; struct arch { const char *name; @@ -32,6 +39,11 @@ struct arch { char memory_ref_char; char imm_char; } objdump; +#ifdef HAVE_DWARF_SUPPORT + void (*update_insn_state)(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die *cu_die, + struct disasm_line *dl); +#endif }; struct ins { -- cgit v1.2.3 From b1d8d968a7983e9756de34b8bcaa24cc339828ed Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:46 +0530 Subject: perf annotate: Update TYPE_STATE_MAX_REGS to include max of regs in powerpc TYPE_STATE_MAX_REGS is arch-dependent. Currently this is defined to be 16. While checking if reg is valid using has_reg_type, max value is checked using TYPE_STATE_MAX_REGS value. Define this conditionally for powerpc. 
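A condensed view of the interplay (the conditional define is from this patch; has_reg_type() was made global earlier in this series; powerpc has 32 GPRs, hence the larger bound):

    #ifdef __powerpc__
    #define TYPE_STATE_MAX_REGS 32
    #else
    #define TYPE_STATE_MAX_REGS 16
    #endif

    /* regs[] in struct type_state is sized by TYPE_STATE_MAX_REGS */
    bool has_reg_type(struct type_state *state, int reg)
    {
            return (unsigned)reg < ARRAY_SIZE(state->regs);
    }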
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-4-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 6fe8ee8b8410..992b7ce4bd11 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -189,7 +189,11 @@ struct type_state_stack { }; /* FIXME: This should be arch-dependent */ +#ifdef __powerpc__ +#define TYPE_STATE_MAX_REGS 32 +#else #define TYPE_STATE_MAX_REGS 16 +#endif /* * State table to maintain type info in each register and stack location. -- cgit v1.2.3 From 06dd4c5a561c48c66745352bae0b2c04bbe455be Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:47 +0530 Subject: perf annotate: Add disasm_line__parse() to parse raw instruction for powerpc Currently, the perf tool infrastructure uses the disasm_line__parse function to parse disassembled line. Example snippet from objdump: objdump --start-address=
<address> --stop-address=<address>
-d --no-show-raw-insn -C <vmlinux>

  c0000000010224b4:   lwz     r10,0(r9)

This line "lwz r10,0(r9)" is parsed to extract the instruction name, register names and offset. In powerpc, the approach for data type profiling uses the raw instruction instead of the objdump result to identify the instruction category and extract the source/target registers. Example:

  38 01 81 e8     ld      r4,312(r1)

Here "38 01 81 e8" is the raw instruction representation. Add function "disasm_line__parse_powerpc" to handle parsing of the raw instruction. Also update "struct disasm_line" to save the binary code. With the change, the function captures:

  line            -> "38 01 81 e8     ld      r4,312(r1)"
  raw instruction -> "38 01 81 e8"

The raw instruction is used later to extract the reg/offset fields. Macros are added to extract the opcode and register fields. "struct disasm_line" is updated to carry a union of "bytes" and a 32-bit "raw_insn" to hold the raw code (raw). Function "disasm_line__parse_powerpc" fills in the raw instruction hex value and can use the macros to get the opcode. There are no changes in the existing code paths which parse the disassembled code. The size of the raw instruction depends on the architecture. In case of powerpc, parsing the disasm line needs to handle both reading the binary code directly from the DSO and parsing the objdump result. Hence the logic is added in a separate function instead of updating "disasm_line__parse". Architectures using the instruction name and the present approach are not altered. Since this approach targets powerpc, the macro implementation is added for powerpc as of now. Since disasm_line__parse is used in other cases (perf annotate) and not only data type profiling, the powerpc callback includes changes to work with the binary code as well as the mnemonic representation. Also, in case the DSO read fails and libcapstone is not supported, the approach falls back to using objdump as an option. Hence, the patch also has changes to ensure the objdump option works well.
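For reference, a small self-contained sketch of the field extraction these macros enable; the masks mirror the defines added to arch/powerpc/util/dwarf-regs.c in this patch, while the main() harness is purely illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define PPC_OP(op) (((op) >> 26) & 0x3F)  /* primary opcode, bits 0-5 */
    #define PPC_RT(t)  (((t) >> 21) & 0x1f)   /* target register field */
    #define PPC_RA(a)  (((a) >> 16) & 0x1f)   /* base register field */
    #define PPC_DS(DS) ((DS) & 0xfffc)        /* DS-form displacement */

    int main(void)
    {
            uint32_t insn = 0xe8810138;  /* bytes "38 01 81 e8" = ld r4,312(r1) */

            /* prints: opcode=58 rt=4 ra=1 ds=312 */
            printf("opcode=%u rt=%u ra=%u ds=%u\n",
                   PPC_OP(insn), PPC_RT(insn), PPC_RA(insn), PPC_DS(insn));
            return 0;
    }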
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-5-atrajeev@linux.vnet.ibm.com [ Add check for strndup() result ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/string.h | 2 + tools/lib/string.c | 13 +++++++ tools/perf/arch/powerpc/annotate/instructions.c | 1 + tools/perf/arch/powerpc/util/dwarf-regs.c | 9 +++++ tools/perf/util/annotate.h | 5 ++- tools/perf/util/disasm.c | 51 ++++++++++++++++++++++++- 6 files changed, 79 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index db5c99318c79..0acb1fc14e19 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -46,5 +46,7 @@ extern char * __must_check skip_spaces(const char *); extern char *strim(char *); +extern void remove_spaces(char *s); + extern void *memchr_inv(const void *start, int c, size_t bytes); #endif /* _TOOLS_LINUX_STRING_H_ */ diff --git a/tools/lib/string.c b/tools/lib/string.c index 8b6892f959ab..3126d2cff716 100644 --- a/tools/lib/string.c +++ b/tools/lib/string.c @@ -153,6 +153,19 @@ char *strim(char *s) return skip_spaces(s); } +/* + * remove_spaces - Removes whitespaces from @s + */ +void remove_spaces(char *s) +{ + char *d = s; + + do { + while (*d == ' ') + ++d; + } while ((*s++ = *d++)); +} + /** * strreplace - Replace all occurrences of character in string. * @s: The string to operate on. diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index a3f423c27cae..d57fd023ef9c 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -55,6 +55,7 @@ static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) arch->initialized = true; arch->associate_instruction_ops = powerpc__associate_instruction_ops; arch->objdump.comment_char = '#'; + annotate_opts.show_asm_raw = true; } return 0; diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c index 0c4f4caf53ac..430623ca5612 100644 --- a/tools/perf/arch/powerpc/util/dwarf-regs.c +++ b/tools/perf/arch/powerpc/util/dwarf-regs.c @@ -98,3 +98,12 @@ int regs_query_register_offset(const char *name) return roff->ptregs_offset; return -EINVAL; } + +#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define PPC_RA(a) (((a) >> 16) & 0x1f) +#define PPC_RT(t) (((t) >> 21) & 0x1f) +#define PPC_RB(b) (((b) >> 11) & 0x1f) +#define PPC_D(D) ((D) & 0xfffe) +#define PPC_DS(DS) ((DS) & 0xfffc) +#define OP_LD 58 +#define OP_STD 62 diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index d5c821c22f79..9ba772f46270 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -113,7 +113,10 @@ struct annotation_line { struct disasm_line { struct ins ins; struct ins_operands ops; - + union { + u8 bytes[4]; + u32 raw_insn; + } raw; /* This needs to be at the end. 
*/ struct annotation_line al; }; diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 931cd92dcc40..6d4055b9b966 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -44,6 +44,7 @@ static int call__scnprintf(struct ins *ins, char *bf, size_t size, static void ins__sort(struct arch *arch); static int disasm_line__parse(char *line, const char **namep, char **rawp); +static int disasm_line__parse_powerpc(struct disasm_line *dl); static __attribute__((constructor)) void symbol__init_regexpr(void) { @@ -845,6 +846,51 @@ out: return -1; } +/* + * Parses the result captured from symbol__disassemble_* + * Example, line read from DSO file in powerpc: + * line: 38 01 81 e8 + * opcode: fetched from arch specific get_opcode_insn + * rawp_insn: e8810138 + * + * rawp_insn is used later to extract the reg/offset fields + */ +#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define RAW_BYTES 11 + +static int disasm_line__parse_powerpc(struct disasm_line *dl) +{ + char *line = dl->al.line; + const char **namep = &dl->ins.name; + char **rawp = &dl->ops.raw; + char *tmp_raw_insn, *name_raw_insn = skip_spaces(line); + char *name = skip_spaces(name_raw_insn + RAW_BYTES); + int objdump = 0; + + if (strlen(line) > RAW_BYTES) + objdump = 1; + + if (name_raw_insn[0] == '\0') + return -1; + + if (objdump) { + disasm_line__parse(name, namep, rawp); + } else + *namep = ""; + + tmp_raw_insn = strndup(name_raw_insn, 11); + if (tmp_raw_insn == NULL) + return -1; + + remove_spaces(tmp_raw_insn); + + sscanf(tmp_raw_insn, "%x", &dl->raw.raw_insn); + if (objdump) + dl->raw.raw_insn = be32_to_cpu(dl->raw.raw_insn); + + return 0; +} + static void annotation_line__init(struct annotation_line *al, struct annotate_args *args, int nr) @@ -898,7 +944,10 @@ struct disasm_line *disasm_line__new(struct annotate_args *args) goto out_delete; if (args->offset != -1) { - if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) + if (arch__is(args->arch, "powerpc")) { + if (disasm_line__parse_powerpc(dl) < 0) + goto out_free_line; + } else if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) goto out_free_line; disasm_line__init_ins(dl, args->arch, &args->ms); -- cgit v1.2.3 From 0b971e6bf1c305843a034ca762718b9428a5dd7f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:48 +0530 Subject: perf annotate: Add support to capture and parse raw instruction in powerpc using dso__data_read_offset utility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to capture and parse raw instruction in powerpc. Currently, the perf tool infrastructure uses two ways to disassemble and understand the instruction. One is objdump and other option is via libcapstone. Currently, the perf tool infrastructure uses "--no-show-raw-insn" option with "objdump" while disassemble. Example from powerpc with this option for an instruction address is: Snippet from: objdump --start-address=
<address> --stop-address=<address>
-d --no-show-raw-insn -C <vmlinux>

  c0000000010224b4:   lwz     r10,0(r9)

This line "lwz r10,0(r9)" is parsed to extract the instruction name, register names and offset. Also, to find whether there is a memory reference in the operands, the "memory_ref_char" field of objdump is used. For x86, "(" is used as memory_ref_char to tackle instructions of the form "mov (%rax), %rcx". In case of powerpc, instructions using "(" are not the only memory instructions. Example: the above instruction can also be of the extended form (X form) "lwzx r10,0,r19". In order to easily identify the instruction category and extract the source/target registers, the patch adds support to use the raw instruction for powerpc. The approach used is to read the raw instruction directly from the DSO file using the "dso__data_read_offset" utility, which is already implemented in the perf infrastructure in "util/dso.c". Example:

  38 01 81 e8     ld      r4,312(r1)

Here "38 01 81 e8" is the raw instruction representation. In powerpc, this translates to the instruction form "ld RT,DS(RA)" with the binary layout:

  --------------------------------------
  | 58 |  RT  |  RA  |      DS      | |
  --------------------------------------
  0    6      11     16            30 31

Function "symbol__disassemble_dso" is updated to read the raw instruction directly from the DSO using the dso__data_read_offset utility. In case of the above example, this captures:

  line: 38 01 81 e8

The above works well when 'perf report' is invoked with only the data type sort keys, i.e. type and typeoff, because no instruction-level annotation is needed if only the data type information is requested. For annotating a sample, the "sym" sort key is also needed along with the type and typeoff sort keys. And by default, invoking just "perf report" uses the sort key "sym", which displays the symbol information. With the approach change in powerpc, which first reads the DSO for the raw instruction, "perf annotate" and "perf report" + 'a' key break since they do not do the instruction-level disassembly. Snippet of the result from 'perf report':

  Samples: 1K of event 'mem-loads', 4000 Hz, Event count (approx.): 937238
  do_work  /usr/bin/pmlogger [Percent: local period]
  Percent
         │ ea230010
         │ 3a550010
         │ 3a600000
         │ 38f60001
         │ 39490008
         │ 42400438
   51.44 │ 81290008
         │ 7d485378

Here, the raw instruction is displayed in the output instead of the human-readable annotated form. One way to get the appropriate data is to specify "--objdump path", by which code annotation will be done. But that would change the default behaviour. To fix this breakage, check if the "sym" sort key is set. If so, fall back and use the libcapstone/objdump way of disassembling the sample. With the changes, "perf report" shows:

  Samples: 1K of event 'mem-loads', 4000 Hz, Event count (approx.): 937238
  do_work  /usr/bin/pmlogger [Percent: local period]
  Percent
         │        ld      r17,16(r3)
         │        addi    r18,r21,16
         │        li      r19,0
         │ 8b0:   rldicl  r10,r10,63,33
         │        addi    r10,r10,1
         │        mtctr   r10
         │      ↓ b       8e4
         │ 8c0:   addi    r7,r22,1
         │        addi    r10,r9,8
         │      ↓ bdz     d00
   51.44 │        lwz     r9,8(r9)
         │        mr      r8,r10
         │        cmpw    r20,r9

Committer notes: Just add the extern for 'sort_order' in disasm.c so that we don't end up breaking the build due to this type collision with capstone and libbpf:

  In file included from /usr/include/capstone/capstone.h:325,
                   from /git/perf-6.10.0/tools/perf/util/print_insn.h:23,
                   from builtin-script.c:38:
  /usr/include/capstone/bpf.h:94:14: error: 'bpf_insn' defined as wrong kind of tag
     94 | typedef enum bpf_insn {

I reported this to the bpf mailing list, see one of the links below.
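A condensed view of the resulting decision in symbol__disassemble() (taken from the hunk below; error handling elided):

    if (arch__is(args->arch, "powerpc")) {
            extern const char *sort_order;

            /* raw-byte decode suffices only when no "sym" output is wanted */
            if (sort_order && !strstr(sort_order, "sym")) {
                    err = symbol__disassemble_raw(symfs_filename, sym, args);
                    if (err == 0)
                            goto out_remove_tmp;
                    /* otherwise fall through to capstone/objdump */
            }
    }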
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-6-atrajeev@linux.vnet.ibm.com Link: https://lore.kernel.org/bpf/ZqOltPk9VQGgJZAA@x1/T/#u Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 6d4055b9b966..bab15cce612f 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1637,6 +1637,91 @@ err: } #endif +static int symbol__disassemble_raw(char *filename, struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + u64 len = end - start; + u64 offset; + int i, count; + u8 *buf = NULL; + char disasm_buf[512]; + struct disasm_line *dl; + u32 *line; + + /* Return if objdump is specified explicitly */ + if (args->options->objdump_path) + return -1; + + pr_debug("Reading raw instruction from : %s using dso__data_read_offset\n", filename); + + buf = malloc(len); + if (buf == NULL) + goto err; + + count = dso__data_read_offset(dso, NULL, sym->start, buf, len); + + line = (u32 *)buf; + + if ((u64)count != len) + goto err; + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + /* Each raw instruction is 4 byte */ + count = len/4; + + for (i = 0, offset = 0; i < count; i++) { + args->offset = offset; + sprintf(args->line, "%x", line[i]); + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + offset += 4; + } + + /* It failed in the middle */ + if (offset != len) { + struct list_head *list = ¬es->src->source; + + /* Discard all lines and fallback to objdump */ + while (!list_empty(list)) { + dl = list_first_entry(list, struct disasm_line, al.node); + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } + count = -1; + } + +out: + free(buf); + return count < 0 ? count : 0; + +err: + count = -1; + goto out; +} /* * Possibly create a new version of line with tabs expanded. Returns the * existing or new line, storage is updated if a new line is allocated. If @@ -1761,6 +1846,23 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) strcpy(symfs_filename, tmp); } + /* + * For powerpc data type profiling, use the dso__data_read_offset + * to read raw instruction directly and interpret the binary code + * to understand instructions and register fields. For sort keys as + * type and typeoff, disassemble to mnemonic notation is + * not required in case of powerpc. 
+ */ + if (arch__is(args->arch, "powerpc")) { + extern const char *sort_order; + + if (sort_order && !strstr(sort_order, "sym")) { + err = symbol__disassemble_raw(symfs_filename, sym, args); + if (err == 0) + goto out_remove_tmp; + } + } + #ifdef HAVE_LIBCAPSTONE_SUPPORT err = symbol__disassemble_capstone(symfs_filename, sym, args); if (err == 0) -- cgit v1.2.3 From 1b4406d2a88cf65d615b0bcb42525d7a4723eb70 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:49 +0530 Subject: perf annotate: Update parameters for reg extract functions to use raw instruction on powerpc Use the raw instruction code and macros to identify memory instructions, extract register fields and also offset. The implementation addresses the D-form, X-form, DS-form instructions. Adds "mem_ref" field to check whether source/target has memory reference. Add function "get_powerpc_regs" which will set these fields: reg1, reg2, offset depending of where it is source or target ops. Update "parse" callback for "struct ins_ops" to also pass "struct disasm_line" as argument. This is needed in parse functions where opcode is used to determine whether to set multi_regs and other fields Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-7-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/annotate/instructions.c | 3 +- tools/perf/arch/loongarch/annotate/instructions.c | 6 ++-- tools/perf/arch/powerpc/util/dwarf-regs.c | 44 +++++++++++++++++++++++ tools/perf/arch/s390/annotate/instructions.c | 5 +-- tools/perf/util/annotate.c | 19 ++++++++-- tools/perf/util/disasm.c | 19 ++++++---- tools/perf/util/disasm.h | 5 ++- tools/perf/util/include/dwarf-regs.h | 11 ++++++ 8 files changed, 96 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 4af0c3a0f86e..f86d9f4798bd 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -11,7 +11,8 @@ struct arm64_annotate { static int arm64_mov__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, - struct map_symbol *ms __maybe_unused) + struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *s = strchr(ops->raw, ','), *target, *endptr; diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c index 21cc7e4149f7..ab43b1ab51e3 100644 --- a/tools/perf/arch/loongarch/annotate/instructions.c +++ b/tools/perf/arch/loongarch/annotate/instructions.c @@ -5,7 +5,8 @@ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited */ -static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { char *c, *endptr, *tok, *name; struct map *map = ms->map; @@ -51,7 +52,8 @@ static struct ins_ops loongarch_call_ops = { .scnprintf = call__scnprintf, }; -static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, struct 
map_symbol *ms, + struct disasm_line *dl __maybe_unused) { struct map *map = ms->map; struct symbol *sym = ms->sym; diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c index 430623ca5612..104c7ae5c433 100644 --- a/tools/perf/arch/powerpc/util/dwarf-regs.c +++ b/tools/perf/arch/powerpc/util/dwarf-regs.c @@ -107,3 +107,47 @@ int regs_query_register_offset(const char *name) #define PPC_DS(DS) ((DS) & 0xfffc) #define OP_LD 58 #define OP_STD 62 + +static int get_source_reg(u32 raw_insn) +{ + return PPC_RA(raw_insn); +} + +static int get_target_reg(u32 raw_insn) +{ + return PPC_RT(raw_insn); +} + +static int get_offset_opcode(u32 raw_insn) +{ + int opcode = PPC_OP(raw_insn); + + /* DS- form */ + if ((opcode == OP_LD) || (opcode == OP_STD)) + return PPC_DS(raw_insn); + else + return PPC_D(raw_insn); +} + +/* + * Fills the required fields for op_loc depending on if it + * is a source or target. + * D form: ins RT,D(RA) -> src_reg1 = RA, offset = D, dst_reg1 = RT + * DS form: ins RT,DS(RA) -> src_reg1 = RA, offset = DS, dst_reg1 = RT + * X form: ins RT,RA,RB -> src_reg1 = RA, src_reg2 = RB, dst_reg1 = RT + */ +void get_powerpc_regs(u32 raw_insn, int is_source, + struct annotated_op_loc *op_loc) +{ + if (is_source) + op_loc->reg1 = get_source_reg(raw_insn); + else + op_loc->reg1 = get_target_reg(raw_insn); + + if (op_loc->multi_regs) + op_loc->reg2 = PPC_RB(raw_insn); + + /* TODO: Implement offset handling for X Form */ + if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31)) + op_loc->offset = get_offset_opcode(raw_insn); +} diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index da5aa3e1f04c..eeac25cca699 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -2,7 +2,7 @@ #include static int s390_call__parse(struct arch *arch, struct ins_operands *ops, - struct map_symbol *ms) + struct map_symbol *ms, struct disasm_line *dl __maybe_unused) { char *endptr, *tok, *name; struct map *map = ms->map; @@ -52,7 +52,8 @@ static struct ins_ops s390_call_ops = { static int s390_mov__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, - struct map_symbol *ms __maybe_unused) + struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *s = strchr(ops->raw, ','), *target, *endptr; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 1451caf25e77..ce99db291c5e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2123,20 +2123,33 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, for_each_insn_op_loc(loc, i, op_loc) { const char *insn_str = ops->source.raw; bool multi_regs = ops->source.multi_regs; + bool mem_ref = ops->source.mem_ref; if (i == INSN_OP_TARGET) { insn_str = ops->target.raw; multi_regs = ops->target.multi_regs; + mem_ref = ops->target.mem_ref; } /* Invalidate the register by default */ op_loc->reg1 = -1; op_loc->reg2 = -1; - if (insn_str == NULL) - continue; + if (insn_str == NULL) { + if (!arch__is(arch, "powerpc")) + continue; + } - if (strchr(insn_str, arch->objdump.memory_ref_char)) { + /* + * For powerpc, call get_powerpc_regs function which extracts the + * required fields for op_loc, ie reg1, reg2, offset from the + * raw instruction. 
+ */ + if (arch__is(arch, "powerpc")) { + op_loc->mem_ref = mem_ref; + op_loc->multi_regs = multi_regs; + get_powerpc_regs(dl->raw.raw_insn, !i, op_loc); + } else if (strchr(insn_str, arch->objdump.memory_ref_char)) { op_loc->mem_ref = true; op_loc->multi_regs = multi_regs; extract_reg_offset(arch, insn_str, op_loc); diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index bab15cce612f..3178dc9bedca 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -255,7 +255,8 @@ bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2) return arch->ins_is_fused(arch, ins1, ins2); } -static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { char *endptr, *tok, *name; struct map *map = ms->map; @@ -350,7 +351,8 @@ static inline const char *validate_comma(const char *c, struct ins_operands *ops return c; } -static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { struct map *map = ms->map; struct symbol *sym = ms->sym; @@ -509,7 +511,8 @@ static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) return 0; } -static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { ops->locked.ops = zalloc(sizeof(*ops->locked.ops)); if (ops->locked.ops == NULL) @@ -524,7 +527,7 @@ static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_s goto out_free_ops; if (ops->locked.ins.ops->parse && - ops->locked.ins.ops->parse(arch, ops->locked.ops, ms) < 0) + ops->locked.ins.ops->parse(arch, ops->locked.ops, ms, NULL) < 0) goto out_free_ops; return 0; @@ -595,7 +598,8 @@ static bool check_multi_regs(struct arch *arch, const char *op) return count > 1; } -static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) +static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *s = strchr(ops->raw, ','), *target, *comment, prev; @@ -673,7 +677,8 @@ static struct ins_ops mov_ops = { .scnprintf = mov__scnprintf, }; -static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) +static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *target, *comment, *s, prev; @@ -814,7 +819,7 @@ static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, str if (!dl->ins.ops) return; - if (dl->ins.ops->parse && dl->ins.ops->parse(arch, &dl->ops, ms) < 0) + if (dl->ins.ops->parse && dl->ins.ops->parse(arch, &dl->ops, ms, dl) < 0) dl->ins.ops = NULL; } diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index c835759c8e2b..30be0a94ea04 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -62,6 +62,7 @@ struct ins_operands { bool offset_avail; bool outside; bool multi_regs; + bool mem_ref; } target; union { struct { @@ -69,6 +70,7 @@ struct ins_operands { char *name; u64 addr; bool multi_regs; + bool mem_ref; } 
source; struct { struct ins ins; @@ -83,7 +85,8 @@ struct ins_operands { struct ins_ops { void (*free)(struct ins_operands *ops); - int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms); + int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl); int (*scnprintf)(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); }; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index 01fb25a1150a..75b28dcc8317 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PERF_DWARF_REGS_H_ #define _PERF_DWARF_REGS_H_ +#include "annotate.h" #define DWARF_REG_PC 0xd3af9c /* random number */ #define DWARF_REG_FB 0xd3affb /* random number */ @@ -31,6 +32,16 @@ static inline int get_dwarf_regnum(const char *name __maybe_unused, } #endif +#if !defined(__powerpc__) || !defined(HAVE_DWARF_SUPPORT) +static inline void get_powerpc_regs(u32 raw_insn __maybe_unused, int is_source __maybe_unused, + struct annotated_op_loc *op_loc __maybe_unused) +{ + return; +} +#else +void get_powerpc_regs(u32 raw_insn, int is_source, struct annotated_op_loc *op_loc); +#endif + #ifdef HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET /* * Arch should support fetching the offset of a register in pt_regs -- cgit v1.2.3 From 1acdad68183ab870fca1e63b64f32601be612611 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:50 +0530 Subject: perf annotate: Add parse function for memory instructions in powerpc Use the raw instruction code and macros to identify memory instructions, extract register fields and also the offset. The implementation addresses the D-form, X-form and DS-form instructions. Two main functions are added. A new parse function, "load_store__parse", acts as the instruction ops parser for memory instructions. Unlike other parsers (like mov__parse), this one fills in the "multi_regs" field for source/target and the newly added "mem_ref" field. No other fields are set because there is no need to parse the disassembled code here; the arch-specific macros take care of extracting the offset and regs, which is easier and more precise. In powerpc, all instructions with a primary opcode from 32 to 63 are memory instructions. Update the "ins__find" function to take "raw_insn" as an additional parameter.
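To illustrate the opcode test (PPC_OP as defined in this patch; the main() harness and the hand-assembled encodings are only for illustration):

    #include <stdint.h>
    #include <stdio.h>

    #define PPC_OP(op) (((op) >> 26) & 0x3F)  /* primary opcode, bits 0-5 */

    /* opcodes 32..63 are memory instructions: bit 0x20 of the opcode is set */
    static int is_ppc_load_store(uint32_t raw_insn)
    {
            return PPC_OP(raw_insn) & 0x20;
    }

    int main(void)
    {
            printf("%d\n", is_ppc_load_store(0xe8810138)); /* ld r4,312(r1): opcode 58, true */
            printf("%d\n", is_ppc_load_store(0x7c221a14)); /* add r1,r2,r3: opcode 31, false */
            return 0;
    }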
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-8-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 16 ++++++++ tools/perf/util/disasm.c | 54 ++++++++++++++++++++++--- tools/perf/util/disasm.h | 2 +- 3 files changed, 66 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index d57fd023ef9c..b084423d8477 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -49,6 +49,22 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con return ops; } +#define PPC_OP(op) (((op) >> 26) & 0x3F) + +static struct ins_ops *check_ppc_insn(u32 raw_insn) +{ + int opcode = PPC_OP(raw_insn); + + /* + * Instructions with opcode 32 to 63 are memory + * instructions in powerpc + */ + if ((opcode & 0x20)) + return &load_store_ops; + + return NULL; +} + static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) { if (!arch->initialized) { diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 3178dc9bedca..0afa1847f1c5 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -36,6 +36,7 @@ static struct ins_ops mov_ops; static struct ins_ops nop_ops; static struct ins_ops lock_ops; static struct ins_ops ret_ops; +static struct ins_ops load_store_ops; static int jump__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); @@ -521,7 +522,7 @@ static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_s if (disasm_line__parse(ops->raw, &ops->locked.ins.name, &ops->locked.ops->raw) < 0) goto out_free_ops; - ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name); + ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name, 0); if (ops->locked.ins.ops == NULL) goto out_free_ops; @@ -677,6 +678,37 @@ static struct ins_ops mov_ops = { .scnprintf = mov__scnprintf, }; +static int load_store__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, + ops->raw); +} + +/* + * Sets the fields: multi_regs and "mem_ref". + * "mem_ref" is set for ops->source which is later used to + * fill the objdump->memory_ref-char field. 
This ops is currently + used by powerpc and since binary instruction code is used to + extract opcode, regs and offset, no other parsing is needed here + */ +static int load_store__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) +{ + ops->source.mem_ref = true; + ops->source.multi_regs = false; + + ops->target.mem_ref = false; + ops->target.multi_regs = false; + + return 0; +} + +static struct ins_ops load_store_ops = { + .parse = load_store__parse, + .scnprintf = load_store__scnprintf, +}; + static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) { @@ -768,11 +800,23 @@ static void ins__sort(struct arch *arch) qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); } -static struct ins_ops *__ins__find(struct arch *arch, const char *name) +static struct ins_ops *__ins__find(struct arch *arch, const char *name, u32 raw_insn) { struct ins *ins; const int nmemb = arch->nr_instructions; + if (arch__is(arch, "powerpc")) { + /* + * For powerpc, identify the instruction ops + * from the opcode using raw_insn. + */ + struct ins_ops *ops; + + ops = check_ppc_insn(raw_insn); + if (ops) + return ops; + } + if (!arch->sorted_instructions) { ins__sort(arch); arch->sorted_instructions = true; @@ -802,9 +846,9 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name) return ins ? ins->ops : NULL; } -struct ins_ops *ins__find(struct arch *arch, const char *name) +struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn) { - struct ins_ops *ops = __ins__find(arch, name); + struct ins_ops *ops = __ins__find(arch, name, raw_insn); if (!ops && arch->associate_instruction_ops) ops = arch->associate_instruction_ops(arch, name); @@ -814,7 +858,7 @@ struct ins_ops *ins__find(struct arch *arch, const char *name) static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms) { - dl->ins.ops = ins__find(arch, dl->ins.name); + dl->ins.ops = ins__find(arch, dl->ins.name, dl->raw.raw_insn); if (!dl->ins.ops) return; diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 30be0a94ea04..c1bb1e484bfb 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -105,7 +105,7 @@ struct annotate_args { struct arch *arch__find(const char *name); bool arch__is(struct arch *arch, const char *name); -struct ins_ops *ins__find(struct arch *arch, const char *name); +struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn); int ins__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -- cgit v1.2.3 From ace7d681d82d8ac6cc1c9646b19ab4cca7be0d90 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:51 +0530 Subject: perf annotate: Add support to identify memory instructions of opcode 31 in powerpc There are memory instructions in powerpc with opcode 31. Example: "ldx RT,RA,RB". Its X form is as below: ______________________________________ | 31 | RT | RA | RB | 21 |/| -------------------------------------- 0 6 11 16 21 30 31 The opcode for "ldx" is 31. There are other instructions with opcode 31 which are also memory insns, like ldux, stbx, lwzx and lhaux. But not all instructions with opcode 31 are memory instructions. An example is the add instruction: "add RT,RA,RB". The value in bits 21-30 [21 for ldx] differs between these instructions.
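For illustration only (not part of the patch), a self-contained sketch of the bits 21-30 lookup; the table is truncated to three entries and the ldx encoding is an assumed example:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PPC_OP(op)   (((op) >> 26) & 0x3F)
#define PPC_21_30(R) (((R) >> 1) & 0x3ff)

struct insn_offset { const char *name; int value; };

/* Tiny excerpt of the patch's table, kept sorted by value for bsearch(). */
static struct insn_offset ins_array[] = {
	{ .name = "OP_31_XOP_LDX",  .value = 21,  },
	{ .name = "OP_31_XOP_LWZX", .value = 23,  },
	{ .name = "OP_31_XOP_STDX", .value = 149, },
};

static int cmp_offset(const void *a, const void *b)
{
	return ((const struct insn_offset *)a)->value -
	       ((const struct insn_offset *)b)->value;
}

int main(void)
{
	/* Assumed encoding of "ldx r10,r3,r4": opcode 31, bits 21-30 == 21. */
	uint32_t raw_insn = 0x7d43202a;
	struct insn_offset key = { .name = "key", .value = PPC_21_30(raw_insn) };
	struct insn_offset *ret;

	if (PPC_OP(raw_insn) != 31)
		return 1;

	ret = bsearch(&key, ins_array, sizeof(ins_array) / sizeof(ins_array[0]),
		      sizeof(ins_array[0]), cmp_offset);
	printf("%s\n", ret ? ret->name : "not a known memory insn");
	return 0;
}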
Patch uses this value to assign instruction ops for these cases. The naming convention and value to identify these are picked from defines in "arch/powerpc/include/asm/ppc-opcode.h" Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-9-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 107 +++++++++++++++++++++++- tools/perf/util/disasm.c | 3 + 2 files changed, 108 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index b084423d8477..1ffb64c6bd0d 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -49,18 +49,121 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con return ops; } -#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define PPC_21_30(R) (((R) >> 1) & 0x3ff) + +struct insn_offset { + const char *name; + int value; +}; + +/* + * There are memory instructions with opcode 31 which are + * of X Form, Example: + * ldx RT,RA,RB + * ______________________________________ + * | 31 | RT | RA | RB | 21 |/| + * -------------------------------------- + * 0 6 11 16 21 30 31 + * + * But all instructions with opcode 31 are not memory. + * Example: add RT,RA,RB + * + * Use bits 21 to 30 to check memory insns with 31 as opcode. + * In ins_array below, for ldx instruction: + * name => OP_31_XOP_LDX + * value => 21 + */ + +static struct insn_offset ins_array[] = { + { .name = "OP_31_XOP_LXSIWZX", .value = 12, }, + { .name = "OP_31_XOP_LWARX", .value = 20, }, + { .name = "OP_31_XOP_LDX", .value = 21, }, + { .name = "OP_31_XOP_LWZX", .value = 23, }, + { .name = "OP_31_XOP_LDUX", .value = 53, }, + { .name = "OP_31_XOP_LWZUX", .value = 55, }, + { .name = "OP_31_XOP_LXSIWAX", .value = 76, }, + { .name = "OP_31_XOP_LDARX", .value = 84, }, + { .name = "OP_31_XOP_LBZX", .value = 87, }, + { .name = "OP_31_XOP_LVX", .value = 103, }, + { .name = "OP_31_XOP_LBZUX", .value = 119, }, + { .name = "OP_31_XOP_STXSIWX", .value = 140, }, + { .name = "OP_31_XOP_STDX", .value = 149, }, + { .name = "OP_31_XOP_STWX", .value = 151, }, + { .name = "OP_31_XOP_STDUX", .value = 181, }, + { .name = "OP_31_XOP_STWUX", .value = 183, }, + { .name = "OP_31_XOP_STBX", .value = 215, }, + { .name = "OP_31_XOP_STVX", .value = 231, }, + { .name = "OP_31_XOP_STBUX", .value = 247, }, + { .name = "OP_31_XOP_LHZX", .value = 279, }, + { .name = "OP_31_XOP_LHZUX", .value = 311, }, + { .name = "OP_31_XOP_LXVDSX", .value = 332, }, + { .name = "OP_31_XOP_LWAX", .value = 341, }, + { .name = "OP_31_XOP_LHAX", .value = 343, }, + { .name = "OP_31_XOP_LWAUX", .value = 373, }, + { .name = "OP_31_XOP_LHAUX", .value = 375, }, + { .name = "OP_31_XOP_STHX", .value = 407, }, + { .name = "OP_31_XOP_STHUX", .value = 439, }, + { .name = "OP_31_XOP_LXSSPX", .value = 524, }, + { .name = "OP_31_XOP_LDBRX", .value = 532, }, + { .name = "OP_31_XOP_LSWX", .value = 533, }, + { .name = "OP_31_XOP_LWBRX", .value = 534, }, + { .name = "OP_31_XOP_LFSUX", .value = 567, }, + { .name = "OP_31_XOP_LXSDX", .value = 588, }, + { .name = "OP_31_XOP_LSWI", .value = 597, }, + { .name = "OP_31_XOP_LFDX", 
.value = 599, }, + { .name = "OP_31_XOP_LFDUX", .value = 631, }, + { .name = "OP_31_XOP_STXSSPX", .value = 652, }, + { .name = "OP_31_XOP_STDBRX", .value = 660, }, + { .name = "OP_31_XOP_STXWX", .value = 661, }, + { .name = "OP_31_XOP_STWBRX", .value = 662, }, + { .name = "OP_31_XOP_STFSX", .value = 663, }, + { .name = "OP_31_XOP_STFSUX", .value = 695, }, + { .name = "OP_31_XOP_STXSDX", .value = 716, }, + { .name = "OP_31_XOP_STSWI", .value = 725, }, + { .name = "OP_31_XOP_STFDX", .value = 727, }, + { .name = "OP_31_XOP_STFDUX", .value = 759, }, + { .name = "OP_31_XOP_LXVW4X", .value = 780, }, + { .name = "OP_31_XOP_LHBRX", .value = 790, }, + { .name = "OP_31_XOP_LXVD2X", .value = 844, }, + { .name = "OP_31_XOP_LFIWAX", .value = 855, }, + { .name = "OP_31_XOP_LFIWZX", .value = 887, }, + { .name = "OP_31_XOP_STXVW4X", .value = 908, }, + { .name = "OP_31_XOP_STHBRX", .value = 918, }, + { .name = "OP_31_XOP_STXVD2X", .value = 972, }, + { .name = "OP_31_XOP_STFIWX", .value = 983, }, +}; + +static int cmp_offset(const void *a, const void *b) +{ + const struct insn_offset *val1 = a; + const struct insn_offset *val2 = b; + + return (val1->value - val2->value); +} static struct ins_ops *check_ppc_insn(u32 raw_insn) { int opcode = PPC_OP(raw_insn); + int mem_insn_31 = PPC_21_30(raw_insn); + struct insn_offset *ret; + struct insn_offset mem_insns_31_opcode = { + "OP_31_INSN", + mem_insn_31 + }; /* * Instructions with opcode 32 to 63 are memory * instructions in powerpc */ - if ((opcode & 0x20)) + if ((opcode & 0x20)) { return &load_store_ops; + } else if (opcode == 31) { + /* Check for memory instructions with opcode 31 */ + ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset); + if (ret != NULL) + return &load_store_ops; + } return NULL; } diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 0afa1847f1c5..e36be7b7c625 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -697,6 +697,9 @@ static int load_store__parse(struct arch *arch __maybe_unused, struct ins_operan { ops->source.mem_ref = true; ops->source.multi_regs = false; + /* opcode 31 is of X form */ + if (PPC_OP(dl->raw.raw_insn) == 31) + ops->source.multi_regs = true; ops->target.mem_ref = false; ops->target.multi_regs = false; -- cgit v1.2.3 From cd0b6f67c4ab1aecdfedb277c42880fcffe75ace Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:52 +0530 Subject: perf annotate: Add some of the arithmetic instructions to support instruction tracking in powerpc Data-type profiling has the concept of instruction tracking. Example sequence in powerpc: ld r10,264(r3) mr r31,r3 < ld r9,312(r31) or differently lwz r10,264(r3) add r31, r3, RB lwz r9, 0(r31) If a sample is hit at "lwz r9, 0(r31)", the data type of r31 depends on the previous instruction sequence here. So to track the previous instructions, the patch adds changes to identify some of the arithmetic instructions which have opcode 31. Since memory instructions also have cases with opcode 31, use bits 22:30 to filter the arithmetic instructions here. Also, there are instructions with just two operands, like "addme" and "addze".
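As a hedged aside (not part of the patch), a standalone sketch of this opcode-31 filtering; the add encoding is an assumed example, and bit 21 is the OE flag in XO-form, which is why the narrower bits 22-30 field is used for arithmetic:

#include <stdint.h>
#include <stdio.h>

#define PPC_OP(op)   (((op) >> 26) & 0x3F)
#define PPC_21_30(R) (((R) >> 1) & 0x3ff)
#define PPC_22_30(R) (((R) >> 1) & 0x1ff)

/* XO-form "add" carries extended opcode 266 in bits 22-30. */
#define ADD_XO_FORM 266

int main(void)
{
	/* Assumed encoding of "add r31,r3,r4" (primary opcode 31). */
	uint32_t raw_insn = 0x7fe32214;

	if (PPC_OP(raw_insn) == 31) {
		if (PPC_22_30(raw_insn) == ADD_XO_FORM)
			printf("arithmetic: add\n");
		/* "mr" is extended OR; its bits 21-30 hold the value 444. */
		else if (PPC_21_30(raw_insn) == 444)
			printf("arithmetic: mr\n");
	}
	return 0;
}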
This patch adds a new instruction ops, "arithmetic_ops", to handle these. Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-10-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 49 ++++++++++++++++++++++++ tools/perf/util/disasm.c | 51 +++++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index 1ffb64c6bd0d..aa5ee09fa28f 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -51,6 +51,7 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con #define PPC_OP(op) (((op) >> 26) & 0x3F) #define PPC_21_30(R) (((R) >> 1) & 0x3ff) +#define PPC_22_30(R) (((R) >> 1) & 0x1ff) struct insn_offset { const char *name; @@ -134,6 +135,44 @@ static struct insn_offset ins_array[] = { { .name = "OP_31_XOP_STFIWX", .value = 983, }, }; +/* + * Arithmetic instructions which are having opcode as 31. + * These instructions are tracked to save the register state + * changes. Example: + * + * lwz r10,264(r3) + * add r31, r3, r3 + * lwz r9, 0(r31) + * + * Here instruction tracking needs to identify the "add" + * instruction and save data type of r3 to r31. If a sample + * is hit at next "lwz r9, 0(r31)", by this instruction tracking, + * data type of r31 can be resolved. + */ +static struct insn_offset arithmetic_ins_op_31[] = { + { .name = "SUB_CARRY_XO_FORM", .value = 8, }, + { .name = "MUL_HDW_XO_FORM1", .value = 9, }, + { .name = "ADD_CARRY_XO_FORM", .value = 10, }, + { .name = "MUL_HW_XO_FORM1", .value = 11, }, + { .name = "SUB_XO_FORM", .value = 40, }, + { .name = "MUL_HDW_XO_FORM", .value = 73, }, + { .name = "MUL_HW_XO_FORM", .value = 75, }, + { .name = "SUB_EXT_XO_FORM", .value = 136, }, + { .name = "ADD_EXT_XO_FORM", .value = 138, }, + { .name = "SUB_ZERO_EXT_XO_FORM", .value = 200, }, + { .name = "ADD_ZERO_EXT_XO_FORM", .value = 202, }, + { .name = "SUB_EXT_XO_FORM2", .value = 232, }, + { .name = "MUL_DW_XO_FORM", .value = 233, }, + { .name = "ADD_EXT_XO_FORM2", .value = 234, }, + { .name = "MUL_W_XO_FORM", .value = 235, }, + { .name = "ADD_XO_FORM", .value = 266, }, + { .name = "DIV_DW_XO_FORM1", .value = 457, }, + { .name = "DIV_W_XO_FORM1", .value = 459, }, + { .name = "DIV_DW_XO_FORM", .value = 489, }, + { .name = "DIV_W_XO_FORM", .value = 491, }, +}; + + static int cmp_offset(const void *a, const void *b) { const struct insn_offset *val1 = a; @@ -163,6 +202,16 @@ static struct ins_ops *check_ppc_insn(u32 raw_insn) ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset); if (ret != NULL) return &load_store_ops; + else { + mem_insns_31_opcode.value = PPC_22_30(raw_insn); + ret = bsearch(&mem_insns_31_opcode, arithmetic_ins_op_31, ARRAY_SIZE(arithmetic_ins_op_31), + sizeof(arithmetic_ins_op_31[0]), cmp_offset); + if (ret != NULL) + return &arithmetic_ops; + /* Bits 21 to 30 has value 444 for "mr" insn ie, OR X form */ + if (PPC_21_30(raw_insn) == 444) + return &arithmetic_ops; + } } return NULL; diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index e36be7b7c625..65915e1f575a 100644
--- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -37,6 +37,7 @@ static struct ins_ops nop_ops; static struct ins_ops lock_ops; static struct ins_ops ret_ops; static struct ins_ops load_store_ops; +static struct ins_ops arithmetic_ops; static int jump__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); @@ -678,6 +679,56 @@ static struct ins_ops mov_ops = { .scnprintf = mov__scnprintf, }; +#define PPC_22_30(R) (((R) >> 1) & 0x1ff) +#define MINUS_EXT_XO_FORM 234 +#define SUB_EXT_XO_FORM 232 +#define ADD_ZERO_EXT_XO_FORM 202 +#define SUB_ZERO_EXT_XO_FORM 200 + +static int arithmetic__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, + ops->raw); +} + +/* + * Sets the fields: multi_regs and "mem_ref". + * "mem_ref" is set for ops->source which is later used to + * fill the objdump->memory_ref-char field. This ops is currently + * used by powerpc and since binary instruction code is used to + * extract opcode, regs and offset, no other parsing is needed here. + * + * Dont set multi regs for 4 cases since it has only one operand + * for source: + * - Add to Minus One Extended XO-form ( Ex: addme, addmeo ) + * - Subtract From Minus One Extended XO-form ( Ex: subfme ) + * - Add to Zero Extended XO-form ( Ex: addze, addzeo ) + * - Subtract From Zero Extended XO-form ( Ex: subfze ) + */ +static int arithmetic__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, struct disasm_line *dl) +{ + int opcode = PPC_OP(dl->raw.raw_insn); + + ops->source.mem_ref = false; + if (opcode == 31) { + if ((opcode != MINUS_EXT_XO_FORM) && (opcode != SUB_EXT_XO_FORM) \ + && (opcode != ADD_ZERO_EXT_XO_FORM) && (opcode != SUB_ZERO_EXT_XO_FORM)) + ops->source.multi_regs = true; + } + + ops->target.mem_ref = false; + ops->target.multi_regs = false; + + return 0; +} + +static struct ins_ops arithmetic_ops = { + .parse = arithmetic__parse, + .scnprintf = arithmetic__scnprintf, +}; + static int load_store__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { -- cgit v1.2.3 From 539bfea3e09c8e7a773b0fc4f6a4b26d921d63ef Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:53 +0530 Subject: perf annotate: Add more instructions for instruction tracking Add a few more instructions and use the opcode as the search key to find if an instruction is supported by the architecture.
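For illustration only (not part of the patch), a hypothetical standalone version of this primary-opcode lookup; the table mirrors the arithmetic_two_ops entries in the diff below, and the addi encoding is an assumed example:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PPC_OP(op) (((op) >> 26) & 0x3F)

struct insn_offset { const char *name; int value; };

/* D-form arithmetic instructions, keyed by primary opcode (sorted). */
static struct insn_offset arithmetic_two_ops[] = {
	{ .name = "mulli",  .value = 7,  },
	{ .name = "subfic", .value = 8,  },
	{ .name = "addic",  .value = 12, },
	{ .name = "addic.", .value = 13, },
	{ .name = "addi",   .value = 14, },
	{ .name = "addis",  .value = 15, },
};

static int cmp_offset(const void *a, const void *b)
{
	return ((const struct insn_offset *)a)->value -
	       ((const struct insn_offset *)b)->value;
}

int main(void)
{
	/* Assumed encoding of "addi r31,r3,100": primary opcode 14. */
	struct insn_offset key = { .name = "key", .value = PPC_OP(0x3be30064) };
	struct insn_offset *ret;

	ret = bsearch(&key, arithmetic_two_ops,
		      sizeof(arithmetic_two_ops) / sizeof(arithmetic_two_ops[0]),
		      sizeof(arithmetic_two_ops[0]), cmp_offset);
	printf("%s\n", ret ? ret->name : "not tracked");
	return 0;
}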
The added ones are: addi, addic, addic., addis, subfic and mulli. Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-11-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index aa5ee09fa28f..aa25a336d8d0 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -172,6 +172,14 @@ static struct insn_offset arithmetic_ins_op_31[] = { { .name = "DIV_W_XO_FORM", .value = 491, }, }; +static struct insn_offset arithmetic_two_ops[] = { + { .name = "mulli", .value = 7, }, + { .name = "subfic", .value = 8, }, + { .name = "addic", .value = 12, }, + { .name = "addic.", .value = 13, }, + { .name = "addi", .value = 14, }, + { .name = "addis", .value = 15, }, +}; static int cmp_offset(const void *a, const void *b) { @@ -212,6 +220,12 @@ static struct ins_ops *check_ppc_insn(u32 raw_insn) if (PPC_21_30(raw_insn) == 444) return &arithmetic_ops; } + } else { + mem_insns_31_opcode.value = opcode; + ret = bsearch(&mem_insns_31_opcode, arithmetic_two_ops, ARRAY_SIZE(arithmetic_two_ops), + sizeof(arithmetic_two_ops[0]), cmp_offset); + if (ret != NULL) + return &arithmetic_ops; } return NULL; -- cgit v1.2.3 From 88444952bdfe27fcc91b1c499a0f77675db592a9 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:54 +0530 Subject: perf annotate: Update instruction tracking for powerpc Add instruction tracking function "update_insn_state_powerpc" for powerpc. Example sequence in powerpc: ld r10,264(r3) mr r31,r3 < ld r9,312(r31) Consider that the sample is pointing to: "ld r9,312(r31)". Here the memory reference is hit at "312(r31)", where 312 is the offset and r31 is the source register. The previous instruction sequence shows that the register state of r3 is moved to r31. So to identify the data type for the r31 access, the previous instruction ("mr") needs to be tracked and the state type entry has to be updated. Current instruction tracking support in the perf tools infrastructure is specific to x86. This patch adds the support for powerpc as well.
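To make the tracking concrete, here is a deliberately simplified model (an illustrative sketch with hypothetical types, not perf's actual data structures) of how the "mr" handling propagates a register's resolved type:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for perf's per-register type state. */
struct reg_state {
	const char *type_name;
	bool ok;
};

/* Model of the "mr r31,r3" handling: the tracked type of the source
 * register is copied to the destination register, so a later sample on
 * "ld r9,312(r31)" can resolve the data type behind r31. */
static void track_mr(struct reg_state *regs, int dst, int src)
{
	if (!regs[src].ok) {
		regs[dst].ok = false;
		return;
	}
	regs[dst] = regs[src];
}

int main(void)
{
	struct reg_state regs[32] = { { NULL, false } };

	/* Suppose DWARF info already resolved r3 (hypothetical type name). */
	regs[3].type_name = "struct task_struct *";
	regs[3].ok = true;

	track_mr(regs, 31, 3);	/* mr r31,r3 */
	printf("r31: %s\n", regs[31].ok ? regs[31].type_name : "unknown");
	return 0;
}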
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-12-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 59 +++++++++++++++++++++++++ tools/perf/util/annotate-data.c | 9 +++- tools/perf/util/disasm.c | 3 ++ 3 files changed, 70 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index aa25a336d8d0..af1032572bf3 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -231,6 +231,65 @@ static struct ins_ops *check_ppc_insn(u32 raw_insn) return NULL; } +/* + * Instruction tracking function to track register state moves. + * Example sequence: + * ld r10,264(r3) + * mr r31,r3 + * < + * ld r9,312(r31) + * + * Previous instruction sequence shows that register state of r3 + * is moved to r31. update_insn_state_powerpc tracks these state + * changes + */ +#ifdef HAVE_DWARF_SUPPORT +static void update_insn_state_powerpc(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die * cu_die __maybe_unused, + struct disasm_line *dl) +{ + struct annotated_insn_loc loc; + struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; + struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; + struct type_state_reg *tsr; + u32 insn_offset = dl->al.offset; + + if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) + return; + + /* + * Value 444 for bits 21:30 is for "mr" + * instruction. "mr" is extended OR. So set the + * source and destination reg correctly + */ + if (PPC_21_30(dl->raw.raw_insn) == 444) { + int src_reg = src->reg1; + + src->reg1 = dst->reg1; + dst->reg1 = src_reg; + } + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) { + tsr->ok = false; + return; + } + + tsr->type = state->regs[src->reg1].type; + tsr->kind = state->regs[src->reg1].kind; + tsr->ok = true; + + pr_debug_dtp("mov [%x] reg%d -> reg%d", + insn_offset, src->reg1, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); +} +#endif /* HAVE_DWARF_SUPPORT */ + static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) { if (!arch->initialized) { diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 7a48c3d72b89..734acdd8c4b7 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1080,6 +1080,13 @@ out: return ret; } +static int arch_supports_insn_tracking(struct data_loc_info *dloc) +{ + if ((arch__is(dloc->arch, "x86")) || (arch__is(dloc->arch, "powerpc"))) + return 1; + return 0; +} + /* * Construct a list of basic blocks for each scope with variables and try to find * the data type by updating a type state table through instructions. 
@@ -1094,7 +1101,7 @@ static int find_data_type_block(struct data_loc_info *dloc, int ret = -1; /* TODO: other architecture support */ - if (!arch__is(dloc->arch, "x86")) + if (!arch_supports_insn_tracking(dloc)) return -1; prev_dst_ip = dst_ip = dloc->ip; diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 65915e1f575a..3d25443e1162 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -156,6 +156,9 @@ static struct arch architectures[] = { { .name = "powerpc", .init = powerpc__annotate_init, +#ifdef HAVE_DWARF_SUPPORT + .update_insn_state = update_insn_state_powerpc, +#endif }, { .name = "riscv64", -- cgit v1.2.3 From 1fe86bc245abd1aded01403675d6610455794b5f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:55 +0530 Subject: perf annotate: Make capstone_init non-static so that it can be used during symbol disassemble symbol__disassemble_capstone in util/disasm.c calls the function open_capstone_handle to open/init the capstone. We already have a capstone_init function in "util/print_insn.c". But capstone_init is defined as a static function in util/print_insn.c. Change this and also add the function in print_insn.h. The open_capstone_handle checks the disassembler_style option from annotation_options to decide whether to set CS_OPT_SYNTAX_ATT. Add that logic in capstone_init also, and by default set it to true. Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-13-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/print_insn.c | 12 +++++++++--- tools/perf/util/print_insn.h | 5 +++++ 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index a950e9157d2d..a76aae81d7a0 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -32,7 +32,7 @@ size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) #ifdef HAVE_LIBCAPSTONE_SUPPORT #include <capstone/capstone.h> -static int capstone_init(struct machine *machine, csh *cs_handle, bool is64) +int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style) { cs_arch arch; cs_mode mode; @@ -62,7 +62,13 @@ static int capstone_init(struct machine *machine, csh *cs_handle, bool is64) } if (machine__normalized_is(machine, "x86")) { - cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); + /* + * In case of using capstone_init while symbol__disassemble + * setting CS_OPT_SYNTAX_ATT depends if disassembler_style opts + * is set via annotation args + */ + if (disassembler_style) + cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); /* * Resolving address operands to symbols is implemented * on x86 by investigating instruction details. @@ -122,7 +128,7 @@ ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpum int ret; /* TODO: Try to initiate capstone only once but need a proper place.
*/ - ret = capstone_init(machine, &cs_handle, is64bit); + ret = capstone_init(machine, &cs_handle, is64bit, true); if (ret < 0) return ret; diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h index 07d11af3fc1c..2c8ee41c4a5d 100644 --- a/tools/perf/util/print_insn.h +++ b/tools/perf/util/print_insn.h @@ -19,4 +19,9 @@ ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpum bool is64bit, const uint8_t *code, size_t code_size, uint64_t ip, int *lenp, int print_opts, FILE *fp); +#ifdef HAVE_LIBCAPSTONE_SUPPORT +#include <capstone/capstone.h> +int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); +#endif + #endif /* PERF_PRINT_INSN_H */ -- cgit v1.2.3 From f1e9347c855de73d8df1b0fa763184a46548d62f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:56 +0530 Subject: perf annotate: Use capstone_init and remove open_capstone_handle from disasm.c capstone_init is made available for all archs to use and updated to enable support for CS_ARCH_PPC as well. The patch removes open_capstone_handle and uses capstone_init in all the places. Committer notes: Avoid including capstone/capstone.h from print_insn.h to not break the build in builtin-script.c due to the namespace clash with libbpf: /usr/include/capstone/bpf.h:94:14: error: 'bpf_insn' defined as wrong kind of tag Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-14-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 2 ++ tools/perf/util/print_insn.c | 2 ++ tools/perf/util/print_insn.h | 5 ----- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 3d25443e1162..7a11edefe1bd 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1512,6 +1512,8 @@ symbol__disassemble_bpf_image(struct symbol *sym, #ifdef HAVE_LIBCAPSTONE_SUPPORT #include <capstone/capstone.h> +int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); + static int open_capstone_handle(struct annotate_args *args, bool is_64bit, csh *handle) { diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index a76aae81d7a0..a33a7726422d 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -32,6 +32,8 @@ size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) #ifdef HAVE_LIBCAPSTONE_SUPPORT #include <capstone/capstone.h> +int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); + int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style) { cs_arch arch; diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h index 2c8ee41c4a5d..07d11af3fc1c 100644 --- a/tools/perf/util/print_insn.h +++ b/tools/perf/util/print_insn.h @@ -19,9 +19,4 @@ ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpum bool is64bit, const uint8_t *code, size_t code_size, uint64_t ip, int *lenp, int print_opts, FILE *fp); -#ifdef HAVE_LIBCAPSTONE_SUPPORT -#include <capstone/capstone.h> -int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); -#endif - #endif /* PERF_PRINT_INSN_H */ -- cgit v1.2.3 From c5d60de1813a9e09ea6d94f82da287d0fa1e1179
Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:57 +0530 Subject: perf annotate: Add support to use libcapstone in powerpc Now perf uses the capstone library to disassemble instructions on x86. capstone is used (if available) to speed up perf annotate. Currently it only supports the x86 architecture. This patch includes changes to enable this in powerpc. For now, this method is used only for data type sort keys, and only the binary code (raw instruction) is read. This is because the powerpc approach to understanding instructions and reg fields uses the raw instruction. The "cs_disasm" is currently not enabled. While attempting to do cs_disasm, the observation was that some of the instructions were not identified (ex: extswsli, maddld) and it had to fall back to objdump. Hence enabling "cs_disasm" is added in the comment section as a TODO for powerpc. Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-15-atrajeev@linux.vnet.ibm.com [ Use dso__nsinfo(dso) as required to match EXTRA_CFLAGS=-DREFCNT_CHECKING=1 build expectations ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 7a11edefe1bd..6e7a3db9f330 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1614,6 +1614,144 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, } } +static int symbol__disassemble_capstone_powerpc(char *filename, struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + struct nscookie nsc; + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + u64 len = end - start; + u64 offset; + int i, fd, count; + bool is_64bit = false; + bool needs_cs_close = false; + u8 *buf = NULL; + struct find_file_offset_data data = { + .ip = start, + }; + csh handle; + char disasm_buf[512]; + struct disasm_line *dl; + u32 *line; + bool disassembler_style = false; + + if (args->options->objdump_path) + return -1; + + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + fd = open(filename, O_RDONLY); + nsinfo__mountns_exit(&nsc); + if (fd < 0) + return -1; + + if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, + &is_64bit) == 0) + goto err; + + if (!args->options->disassembler_style || + !strcmp(args->options->disassembler_style, "att")) + disassembler_style = true; + + if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0) + goto err; + + needs_cs_close = true; + + buf = malloc(len); + if (buf == NULL) + goto err; + + count = pread(fd, buf, len, data.offset); + close(fd); + fd = -1; + + if ((u64)count != len) + goto err; + + line = (u32 *)buf; + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, &notes->src->source); + + /* + * TODO:
enable disassm for powerpc * count = cs_disasm(handle, buf, len, start, len, &insn); * + * For now, only binary code is saved in disassembled line + * to be used in "type" and "typeoff" sort keys. Each raw code + * is 32 bit instruction. So use "len/4" to get the number of + * entries. + */ + count = len/4; + + for (i = 0, offset = 0; i < count; i++) { + args->offset = offset; + sprintf(args->line, "%x", line[i]); + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, &notes->src->source); + + offset += 4; + } + + /* It failed in the middle */ + if (offset != len) { + struct list_head *list = &notes->src->source; + + /* Discard all lines and fallback to objdump */ + while (!list_empty(list)) { + dl = list_first_entry(list, struct disasm_line, al.node); + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } + count = -1; + } + +out: + if (needs_cs_close) + cs_close(&handle); + free(buf); + return count < 0 ? count : 0; + +err: + if (fd >= 0) + close(fd); + if (needs_cs_close) { + struct disasm_line *tmp; + + /* + * It probably failed in the middle of the above loop. + * Release any resources it might add. + */ + list_for_each_entry_safe(dl, tmp, &notes->src->source, al.node) { + list_del(&dl->al.node); + free(dl); + } + } + count = -1; + goto out; +} + static int symbol__disassemble_capstone(char *filename, struct symbol *sym, struct annotate_args *args) { @@ -1968,6 +2106,11 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) err = symbol__disassemble_raw(symfs_filename, sym, args); if (err == 0) goto out_remove_tmp; +#ifdef HAVE_LIBCAPSTONE_SUPPORT + err = symbol__disassemble_capstone_powerpc(symfs_filename, sym, args); + if (err == 0) + goto out_remove_tmp; +#endif } } -- cgit v1.2.3 From 2c9db7475e5de5125b5043fe8f560284b248b0ae Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:58 +0530 Subject: perf annotate: Set instruction name to be used with insn-stat when using raw instruction Since the "ins.name" is not set while using raw instruction, 'perf annotate' with insn-stat gives wrong data: Result from "./perf annotate --data-type --insn-stat": Annotate Instruction stats total 615, ok 419 (68.1%), bad 196 (31.9%) Name : Good Bad ----------------------------------------------------------- : 419 196 This patch sets "dl->ins.name" in the arch specific function "check_ppc_insn" while initialising "struct disasm_line". Also update the "ins__find" function to pass "struct disasm_line" as a parameter so as to set its name field in the arch specific call.
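For illustration only (not the actual perf code), a minimal sketch of that naming fallback, reusing the PPC_OP macro and an assumed raw encoding:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PPC_OP(op) (((op) >> 26) & 0x3F)

int main(void)
{
	/* Assumed encoding of "ld r9,312(r31)": primary opcode 58. */
	uint32_t raw_insn = 0xe93f0138;
	char name_insn[32];
	char *name;

	/* Raw lines carry no mnemonic, so format the decimal primary opcode
	 * as the stat key; these show up as rows like "58" in the output. */
	snprintf(name_insn, sizeof(name_insn), "%d", PPC_OP(raw_insn));
	name = strdup(name_insn);
	printf("insn-stat key: %s\n", name);
	free(name);
	return 0;
}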
With the patch changes: Annotate Instruction stats total 609, ok 446 (73.2%), bad 163 (26.8%) Name/opcode : Good Bad ----------------------------------------------------------- 58 : 323 80 32 : 49 43 34 : 33 11 OP_31_XOP_LDX : 8 20 40 : 23 0 OP_31_XOP_LWARX : 5 1 OP_31_XOP_LWZX : 2 3 OP_31_XOP_LDARX : 3 0 33 : 0 2 OP_31_XOP_LBZX : 0 1 OP_31_XOP_LWAX : 0 1 OP_31_XOP_LHZX : 0 1 Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-16-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 18 +++++++++++++++--- tools/perf/builtin-annotate.c | 4 ++-- tools/perf/util/annotate.c | 2 +- tools/perf/util/disasm.c | 10 +++++----- tools/perf/util/disasm.h | 2 +- 5 files changed, 24 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index af1032572bf3..ede9eeade0ab 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -189,8 +189,9 @@ static int cmp_offset(const void *a, const void *b) return (val1->value - val2->value); } -static struct ins_ops *check_ppc_insn(u32 raw_insn) +static struct ins_ops *check_ppc_insn(struct disasm_line *dl) { + int raw_insn = dl->raw.raw_insn; int opcode = PPC_OP(raw_insn); int mem_insn_31 = PPC_21_30(raw_insn); struct insn_offset *ret; @@ -198,19 +199,30 @@ static struct ins_ops *check_ppc_insn(u32 raw_insn) "OP_31_INSN", mem_insn_31 }; + char name_insn[32]; /* * Instructions with opcode 32 to 63 are memory * instructions in powerpc */ if ((opcode & 0x20)) { + /* + * Set name in case of raw instruction to + * opcode to be used in insn-stat + */ + if (!strlen(dl->ins.name)) { + sprintf(name_insn, "%d", opcode); + dl->ins.name = strdup(name_insn); + } return &load_store_ops; } else if (opcode == 31) { /* Check for memory instructions with opcode 31 */ ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset); - if (ret != NULL) + if (ret) { + if (!strlen(dl->ins.name)) + dl->ins.name = strdup(ret->name); return &load_store_ops; - else { + } else { mem_insns_31_opcode.value = PPC_22_30(raw_insn); ret = bsearch(&mem_insns_31_opcode, arithmetic_ins_op_31, ARRAY_SIZE(arithmetic_ins_op_31), sizeof(arithmetic_ins_op_31[0]), cmp_offset); diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index b10b7f005658..cf60392b1c19 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -396,10 +396,10 @@ static void print_annotate_item_stat(struct list_head *head, const char *title) printf("total %d, ok %d (%.1f%%), bad %d (%.1f%%)\n\n", total, total_good, 100.0 * total_good / (total ?: 1), total_bad, 100.0 * total_bad / (total ?: 1)); - printf(" %-10s: %5s %5s\n", "Name", "Good", "Bad"); + printf(" %-20s: %5s %5s\n", "Name/opcode", "Good", "Bad"); printf("-----------------------------------------------------------\n"); list_for_each_entry(istat, head, list) - printf(" %-10s: %5d %5d\n", istat->name, istat->good, istat->bad); + printf(" %-20s: %5d %5d\n", istat->name, istat->good, istat->bad); printf("\n"); } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 
ce99db291c5e..a2ee4074f768 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2229,7 +2229,7 @@ static struct annotated_item_stat *annotate_data_stat(struct list_head *head, return NULL; istat->name = strdup(name); - if (istat->name == NULL) { + if ((istat->name == NULL) || (!strlen(istat->name))) { free(istat); return NULL; } diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 6e7a3db9f330..410e52cd9cfd 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -857,7 +857,7 @@ static void ins__sort(struct arch *arch) qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); } -static struct ins_ops *__ins__find(struct arch *arch, const char *name, u32 raw_insn) +static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct disasm_line *dl) { struct ins *ins; const int nmemb = arch->nr_instructions; @@ -869,7 +869,7 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, u32 raw_ */ struct ins_ops *ops; - ops = check_ppc_insn(raw_insn); + ops = check_ppc_insn(dl); if (ops) return ops; } @@ -903,9 +903,9 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, u32 raw_ return ins ? ins->ops : NULL; } -struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn) +struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl) { - struct ins_ops *ops = __ins__find(arch, name, raw_insn); + struct ins_ops *ops = __ins__find(arch, name, dl); if (!ops && arch->associate_instruction_ops) ops = arch->associate_instruction_ops(arch, name); @@ -915,7 +915,7 @@ struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn) static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms) { - dl->ins.ops = ins__find(arch, dl->ins.name, dl->raw.raw_insn); + dl->ins.ops = ins__find(arch, dl->ins.name, dl); if (!dl->ins.ops) return; diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index c1bb1e484bfb..f56beedeb9da 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -105,7 +105,7 @@ struct annotate_args { struct arch *arch__find(const char *name); bool arch__is(struct arch *arch, const char *name); -struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn); +struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl); int ins__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -- cgit v1.2.3 From 42d37fc0c819b81f6f6afd108b55d04ba9d32d0f Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 23 Jul 2024 10:51:54 +0530 Subject: perf vendor events power10: Update JSON/events Update JSON/events for power10 platform with additional events. Also move PM_VECTOR_LD_CMPL event from others.json to frontend.json file. 
Reviewed-by: Ian Rogers Signed-off-by: Kajol Jain Tested-by: Disha Goel Cc: Akanksha J N Cc: Athira Rajeev Cc: Disha Goel Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: hbathini@linux.ibm.com Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20240723052154.96202-1-kjain@linux.ibm.com [ Remove alternative to ' char that made the build break in some distros with a unicode parsing python error ] Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/powerpc/power10/frontend.json | 5 ++ .../pmu-events/arch/powerpc/power10/others.json | 100 +++++++++++++++++++-- 2 files changed, 100 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index 5977f5e64212..53660c279286 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -74,6 +74,11 @@ "EventName": "PM_ISSUE_KILL", "BriefDescription": "Cycles in which an instruction or group of instructions were cancelled after being issued. This event increments once per occurrence, regardless of how many instructions are included in the issue group." }, + { + "EventCode": "0x44054", + "EventName": "PM_VECTOR_LD_CMPL", + "BriefDescription": "Vector load instruction completed." + }, { "EventCode": "0x44056", "EventName": "PM_VECTOR_ST_CMPL", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json index fcf8a8ebe7bd..3789304cb363 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/others.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/others.json @@ -94,11 +94,6 @@ "EventName": "PM_L1_ICACHE_RELOADED_ALL", "BriefDescription": "Counts all instruction cache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch." }, - { - "EventCode": "0x44054", - "EventName": "PM_VECTOR_LD_CMPL", - "BriefDescription": "Vector load instruction completed." - }, { "EventCode": "0x4D05E", "EventName": "PM_BR_CMPL", @@ -108,5 +103,100 @@ "EventCode": "0x400F0", "EventName": "PM_LD_DEMAND_MISS_L1_FIN", "BriefDescription": "Load missed L1, counted at finish time." + }, + { + "EventCode": "0x00000038BC", + "EventName": "PM_ISYNC_CMPL", + "BriefDescription": "Isync completion count per thread." + }, + { + "EventCode": "0x000000C088", + "EventName": "PM_LD0_32B_FIN", + "BriefDescription": "256-bit load finished in the LD0 load execution unit." + }, + { + "EventCode": "0x000000C888", + "EventName": "PM_LD1_32B_FIN", + "BriefDescription": "256-bit load finished in the LD1 load execution unit." + }, + { + "EventCode": "0x000000C090", + "EventName": "PM_LD0_UNALIGNED_FIN", + "BriefDescription": "Load instructions in LD0 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline using the load gather buffer. This typically adds about 10 cycles to the latency of the instruction. This includes loads that cross the 128 byte boundary, octword loads that are not aligned, and a special forward progress case of a load that does not hit in the L1 and crosses the 32 byte boundary and is launched NTC. Counted at finish time." + }, + { + "EventCode": "0x000000C890", + "EventName": "PM_LD1_UNALIGNED_FIN", + "BriefDescription": "Load instructions in LD1 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline using the load gather buffer. 
This typically adds about 10 cycles to the latency of the instruction. This includes loads that cross the 128 byte boundary, octword loads that are not aligned, and a special forward progress case of a load that does not hit in the L1 and crosses the 32 byte boundary and is launched NTC. Counted at finish time." + }, + { + "EventCode": "0x000000C0A4", + "EventName": "PM_ST0_UNALIGNED_FIN", + "BriefDescription": "Store instructions in ST0 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline. This typically adds about 10 cycles to the latency of the instruction. This only includes stores that cross the 128 byte boundary. Counted at finish time." + }, + { + "EventCode": "0x000000C8A4", + "EventName": "PM_ST1_UNALIGNED_FIN", + "BriefDescription": "Store instructions in ST1 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline. This typically adds about 10 cycles to the latency of the instruction. This only includes stores that cross the 128 byte boundary. Counted at finish time." + }, + { + "EventCode": "0x000000C8B8", + "EventName": "PM_STCX_SUCCESS_CMPL", + "BriefDescription": "STCX instructions that completed successfully. Specifically, counts only when a pass status is returned from the nest." + }, + { + "EventCode": "0x000000D0B4", + "EventName": "PM_DC_PREF_STRIDED_CONF", + "BriefDescription": "A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software." + }, + { + "EventCode": "0x000000F880", + "EventName": "PM_SNOOP_TLBIE_CYC", + "BriefDescription": "Cycles in which TLBIE snoops are executed in the LSU." + }, + { + "EventCode": "0x000000F084", + "EventName": "PM_SNOOP_TLBIE_CACHE_WALK_CYC", + "BriefDescription": "TLBIE snoop cycles in which the data cache is being walked." + }, + { + "EventCode": "0x000000F884", + "EventName": "PM_SNOOP_TLBIE_WAIT_ST_CYC", + "BriefDescription": "TLBIE snoop cycles in which older stores are still draining." + }, + { + "EventCode": "0x000000F088", + "EventName": "PM_SNOOP_TLBIE_WAIT_LD_CYC", + "BriefDescription": "TLBIE snoop cycles in which older loads are still draining." + }, + { + "EventCode": "0x000000F08C", + "EventName": "PM_SNOOP_TLBIE_WAIT_MMU_CYC", + "BriefDescription": "TLBIE snoop cycles in which the Load-Store unit is waiting for the MMU to finish invalidation." + }, + { + "EventCode": "0x0000004884", + "EventName": "PM_NO_FETCH_IBUF_FULL_CYC", + "BriefDescription": "Cycles in which no instructions are fetched because there is no room in the instruction buffers." + }, + { + "EventCode": "0x00000048B4", + "EventName": "PM_BR_TKN_UNCOND_FIN", + "BriefDescription": "An unconditional branch finished. All unconditional branches are taken." + }, + { + "EventCode": "0x0B0000016080", + "EventName": "PM_L2_TLBIE_SLBIE_START", + "BriefDescription": "NCU Master received a TLBIE/SLBIEG/SLBIAG operation from the core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." + }, + { + "EventCode": "0x0B0000016880", + "EventName": "PM_L2_TLBIE_SLBIE_DELAY", + "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG command was held in a hottemp condition by the NCU Master. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_TLBIE_SLBIE_SENT to obtain the average time a TLBIE/SLBIEG/SLBIAG command was held. 
Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." } ] -- cgit v1.2.3 From 050f2a03aaadac13fff13f9ab7c5c1f0937ab729 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 16 Jul 2024 15:34:05 +0800 Subject: perf annotate: Convert comma to semicolon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace a comma between expression statements by a semicolon. Reviewed-by: Ian Rogers Signed-off-by: Chen Ni Cc: Adrian Hunter Cc: Ahelenia Ziemiańska Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Pekka Enberg Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240716073405.968801-1-nichen@iscas.ac.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ea986430241e..fe991a81256b 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -985,7 +985,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, browser.b.width = notes->src->widths.max_line_len; browser.b.nr_entries = notes->src->nr_entries; - browser.b.entries = &notes->src->source, + browser.b.entries = &notes->src->source; browser.b.width += 18; /* Percentage */ if (annotate_opts.hide_src_code) -- cgit v1.2.3 From e60fc19eab439736009018406a19fad28e60e664 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 16 Jul 2024 15:43:40 +0800 Subject: perf daemon: Convert comma to semicolon Replace a comma between expression statements by a semicolon.
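As an aside on why this class of typo slips through review, a small self-contained demonstration (hypothetical struct, for illustration only): the comma operator glues the two assignments into one valid expression statement, so the compiler stays silent:

#include <stdio.h>

struct cmd { int sig; int flags; };

int main(void)
{
	struct cmd cmd = { 0, 0 };

	/* The stray comma still compiles: both assignments form a single
	 * comma expression, evaluated left to right... */
	cmd.sig = 12,
	cmd.flags = 2;

	/* ...and here it even behaves identically to two statements,
	 * which is why the typo survives until someone reads the code. */
	printf("sig=%d flags=%d\n", cmd.sig, cmd.flags);
	return 0;
}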
Reviewed-by: Ian Rogers Signed-off-by: Chen Ni Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240716074340.968909-1-nichen@iscas.ac.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index de76bbc50bfb..18f7f417ba11 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -1433,7 +1433,7 @@ static int __cmd_signal(struct daemon *daemon, struct option parent_options[], } memset(&cmd, 0, sizeof(cmd)); - cmd.signal.cmd = CMD_SIGNAL, + cmd.signal.cmd = CMD_SIGNAL; cmd.signal.sig = SIGUSR2; strncpy(cmd.signal.name, name, sizeof(cmd.signal.name) - 1); -- cgit v1.2.3 From 496cae1b3306e2ea7bbb8ca7ced082a2046afed9 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 16 Jul 2024 15:53:47 +0800 Subject: perf inject: Convert comma to semicolon Replace a comma between expression statements by a semicolon. Reviewed-by: Ian Rogers Signed-off-by: Chen Ni Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240716075347.969041-1-nichen@iscas.ac.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a212678d47be..7b4a5d56d279 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2069,7 +2069,7 @@ static int __cmd_inject(struct perf_inject *inject) */ inject->tool.finished_init = host__finished_init; /* Obey finished round ordering */ - inject->tool.finished_round = host__finished_round, + inject->tool.finished_round = host__finished_round; /* Keep track of which CPU a VCPU is runnng on */ inject->tool.context_switch = host__context_switch; /* -- cgit v1.2.3 From 4194744602c8cd971ad7fd95710416ca0568d999 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 16:27:56 +0100 Subject: perf cs-etm: Output 0 instead of 0xdeadbeef when exception packets are flushed Normally exception packets don't directly output a branch sample, but if they're the last record in a buffer then they will. Because they don't have addresses set we'll see the placeholder value CS_ETM_INVAL_ADDR (0xdeadbeef) in the output. Since commit 6035b6804bdf ("perf cs-etm: Support dummy address value for CS_ETM_TRACE_ON packet") we've used 0 as an externally visible "not set" address value. For consistency reasons and to not make exceptions look like an error, change them to use 0 too. This is particularly visible when doing userspace only tracing because trace is disabled when jumping to the kernel, causing the flush and then forcing the last exception packet to be emitted as a branch. With kernel trace included, there is no flush so exception packets don't generate samples until the next range packet and they'll pick up the correct address. 
Before: $ perf record -e cs_etm//u -- stress -i 1 -t 1 $ perf script -F comm,ip,addr,flags stress syscall ffffb7eedbc0 => deadbeefdeadbeef stress syscall ffffb7f14a14 => deadbeefdeadbeef stress syscall ffffb7eedbc0 => deadbeefdeadbeef After: stress syscall ffffb7eedbc0 => 0 stress syscall ffffb7f14a14 => 0 stress syscall ffffb7eedbc0 => 0 Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: gankulkarni@os.amperecomputing.com Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240722152756.59453-2-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 5e9fbcfad7d4..d3e9c64d17d4 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1267,8 +1267,12 @@ static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq, static inline u64 cs_etm__first_executed_instr(struct cs_etm_packet *packet) { - /* Returns 0 for the CS_ETM_DISCONTINUITY packet */ - if (packet->sample_type == CS_ETM_DISCONTINUITY) + /* + * Return 0 for packets that have no addresses so that CS_ETM_INVAL_ADDR doesn't + * appear in samples. + */ + if (packet->sample_type == CS_ETM_DISCONTINUITY || + packet->sample_type == CS_ETM_EXCEPTION) return 0; return packet->start_addr; -- cgit v1.2.3 From ae8e4f4048b839c1cb333d9e3d20e634b430139e Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 23 Jul 2024 14:28:58 +0100 Subject: perf scripts python cs-etm: Restore first sample log in verbose mode The linked commit moved the early return on the first sample to before the verbose log, so move the log earlier too. Now the first sample is also logged and not skipped. Fixes: 2d98dbb4c9c5b09c ("perf scripts python arm-cs-trace-disasm.py: Do not ignore disam first sample") Reviewed-by: Leo Yan Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: coresight@lists.linaro.org Cc: gankulkarni@os.amperecomputing.com Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ruidong Tian Cc: Suzuki Poulouse Link: https://lore.kernel.org/r/20240723132858.12747-1-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/arm-cs-trace-disasm.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index d973c2baed1c..7aff02d84ffb 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -192,17 +192,16 @@ def process_event(param_dict): ip = sample["ip"] addr = sample["addr"] + if (options.verbose == True): + print("Event type: %s" % name) + print_sample(sample) + # Initialize CPU data if it's empty, and directly return back # if this is the first tracing event for this CPU. 
if (cpu_data.get(str(cpu) + 'addr') == None): cpu_data[str(cpu) + 'addr'] = addr return - - if (options.verbose == True): - print("Event type: %s" % name) - print_sample(sample) - # If cannot find dso so cannot dump assembler, bail out if (dso == '[unknown]'): return -- cgit v1.2.3 From c91928a8d5243f6b7355aab6325fe7545ee65404 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 15 Jul 2024 19:07:04 +0300 Subject: perf tools: Enable evsel__is_aux_event() to work for ARM/ARM64 Set pmu->auxtrace on ARM/ARM64 AUX area PMUs. evsel__is_aux_event() needs the setting to identify AUX area tracing selected events. Currently, the features that use evsel__is_aux_event() are used only by Intel PT, but that may change in the future. Reviewed-by: Andi Kleen Reviewed-by: Leo Yan Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Jonathan Cameron Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Yicong Yang Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240715160712.127117-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/pmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c index 1c9541d01722..57dc94a6e38c 100644 --- a/tools/perf/arch/arm/util/pmu.c +++ b/tools/perf/arch/arm/util/pmu.c @@ -23,16 +23,19 @@ void perf_pmu__arch_init(struct perf_pmu *pmu) #ifdef HAVE_AUXTRACE_SUPPORT if (!strcmp(pmu->name, CORESIGHT_ETM_PMU_NAME)) { /* add ETM default config here */ + pmu->auxtrace = true; pmu->selectable = true; pmu->perf_event_attr_init_default = cs_etm_get_default_config; #if defined(__aarch64__) } else if (strstarts(pmu->name, ARM_SPE_PMU_NAME)) { + pmu->auxtrace = true; pmu->selectable = true; pmu->is_uncore = false; pmu->perf_event_attr_init_default = arm_spe_pmu_default_config; if (strstarts(pmu->name, "arm_spe_")) pmu->mem_events = perf_mem_events_arm; } else if (strstarts(pmu->name, HISI_PTT_PMU_NAME)) { + pmu->auxtrace = true; pmu->selectable = true; #endif } -- cgit v1.2.3 From feab89bf991c940e2d44e45ededbde8640ac8830 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 15 Jul 2024 19:07:05 +0300 Subject: perf tools: Enable evsel__is_aux_event() to work for S390_CPUMSF evsel__is_aux_event() identifies AUX area tracing selected events. S390_CPUMSF uses a raw event type (PERF_TYPE_RAW - refer s390_cpumsf_evsel_is_auxtrace()) not a PMU type value that could be checked in evsel__is_aux_event(). However it sets needs_auxtrace_mmap (refer auxtrace_record__init()), so check that first. Currently, the features that use evsel__is_aux_event() are used only by Intel PT, but that may change in the future. 
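To make the resulting check order concrete, here is a minimal standalone model of the logic described above (stub types only, not the actual perf sources; the real helper is in the diff below):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct perf_pmu { bool auxtrace; };

    struct evsel {
            bool needs_auxtrace_mmap;  /* set by auxtrace_record__init() */
            struct perf_pmu *pmu;      /* raw-type events may carry no PMU flag */
    };

    /* Check the mmap flag first so raw-type AUX events like S390_CPUMSF match. */
    static bool is_aux_event(const struct evsel *evsel)
    {
            if (evsel->needs_auxtrace_mmap)
                    return true;
            return evsel->pmu && evsel->pmu->auxtrace;
    }

    int main(void)
    {
            struct perf_pmu pt = { .auxtrace = true };
            struct evsel cpumsf = { .needs_auxtrace_mmap = true, .pmu = NULL };
            struct evsel intel_pt = { .pmu = &pt };

            printf("%d %d\n", is_aux_event(&cpumsf), is_aux_event(&intel_pt)); /* 1 1 */
            return 0;
    }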
Reviewed-by: Andi Kleen Reviewed-by: Leo Yan Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Jonathan Cameron Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Yicong Yang Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240715160712.127117-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 986166bc7c78..0b38c51bd6eb 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1199,8 +1199,12 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu) bool evsel__is_aux_event(const struct evsel *evsel) { - struct perf_pmu *pmu = evsel__find_pmu(evsel); + struct perf_pmu *pmu; + + if (evsel->needs_auxtrace_mmap) + return true; + pmu = evsel__find_pmu(evsel); return pmu && pmu->auxtrace; } -- cgit v1.2.3 From 156e8dcfeceebce0694fb00542ac68e61a785fe1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 27 Jul 2024 18:59:19 +0100 Subject: perf test pmu: Remove unused test_pmus Commit aa1551f299ba ("perf test pmu: Refactor format test and exposed test APIs") added the 'test_pmus' list, but didn't use it. (It seems to put them on the other_pmus list?) Remove it. Fixes: aa1551f299ba414c ("perf test pmu: Refactor format test and exposed test APIs") Reviewed-by: Ian Rogers Signed-off-by: Dr. David Alan Gilbert Cc: Ian Rogers Cc: Kan Liang Link: https://lore.kernel.org/lkml/20240727175919.1041468-1-linux@treblig.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 40132655ccd1..0b2f04a55d7b 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -18,9 +18,6 @@ #include #include -/* Fake PMUs created in temp directory. */ -static LIST_HEAD(test_pmus); - /* Cleanup test PMU directory. */ static int test_pmu_put(const char *dir, struct perf_pmu *pmu) { -- cgit v1.2.3 From c77800894b5a7d7d37b83862fd732230c5c3e518 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 28 Jul 2024 17:41:24 -0700 Subject: perf ftrace: Add 'tail' option to --graph-opts The 'graph-tail' option is to print function name as a comment at the end. This is useful when a large function is mixed with other functions (possibly from different CPUs). For example, $ sudo perf ftrace -- perf stat true ... 1) | get_unused_fd_flags() { 1) | alloc_fd() { 1) 0.178 us | _raw_spin_lock(); 1) 0.187 us | expand_files(); 1) 0.169 us | _raw_spin_unlock(); 1) 1.211 us | } 1) 1.503 us | } $ sudo perf ftrace --graph-opts tail -- perf stat true ... 
1) | get_unused_fd_flags() { 1) | alloc_fd() { 1) 0.099 us | _raw_spin_lock(); 1) 0.083 us | expand_files(); 1) 0.081 us | _raw_spin_unlock(); 1) 0.601 us | } /* alloc_fd */ 1) 0.751 us | } /* get_unused_fd_flags */ Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/20240729004127.238611-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-ftrace.txt | 1 + tools/perf/builtin-ftrace.c | 18 ++++++++++++++++++ tools/perf/util/ftrace.h | 1 + 3 files changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index d780b93fcf87..7ea1645a13cf 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -125,6 +125,7 @@ OPTIONS for 'perf ftrace trace' - verbose - Show process names, PIDs, timestamps, etc. - thresh= - Setup trace duration threshold in microseconds. - depth= - Set max depth for function graph tracer to follow. + - tail - Print function name at the end. OPTIONS for 'perf ftrace latency' diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index eb30c8eca488..33c52ebda2fd 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -228,6 +228,7 @@ static void reset_tracing_options(struct perf_ftrace *ftrace __maybe_unused) write_tracing_option_file("funcgraph-irqs", "1"); write_tracing_option_file("funcgraph-proc", "0"); write_tracing_option_file("funcgraph-abstime", "0"); + write_tracing_option_file("funcgraph-tail", "0"); write_tracing_option_file("latency-format", "0"); write_tracing_option_file("irq-info", "0"); } @@ -464,6 +465,17 @@ static int set_tracing_funcgraph_verbose(struct perf_ftrace *ftrace) return 0; } +static int set_tracing_funcgraph_tail(struct perf_ftrace *ftrace) +{ + if (!ftrace->graph_tail) + return 0; + + if (write_tracing_option_file("funcgraph-tail", "1") < 0) + return -1; + + return 0; +} + static int set_tracing_thresh(struct perf_ftrace *ftrace) { int ret; @@ -540,6 +552,11 @@ static int set_tracing_options(struct perf_ftrace *ftrace) return -1; } + if (set_tracing_funcgraph_tail(ftrace) < 0) { + pr_err("failed to set tracing option funcgraph-tail\n"); + return -1; + } + return 0; } @@ -1099,6 +1116,7 @@ static int parse_graph_tracer_opts(const struct option *opt, { .name = "verbose", .value_ptr = &ftrace->graph_verbose }, { .name = "thresh", .value_ptr = &ftrace->graph_thresh }, { .name = "depth", .value_ptr = &ftrace->graph_depth }, + { .name = "tail", .value_ptr = &ftrace->graph_tail }, { .name = NULL, } }; diff --git a/tools/perf/util/ftrace.h b/tools/perf/util/ftrace.h index 558efcb98d25..2515e841c64c 100644 --- a/tools/perf/util/ftrace.h +++ b/tools/perf/util/ftrace.h @@ -25,6 +25,7 @@ struct perf_ftrace { int graph_noirqs; int graph_verbose; int graph_thresh; + int graph_tail; }; struct filter_entry { -- cgit v1.2.3 From 608585f43f9e4798db7585d8999c0404103d4b66 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 28 Jul 2024 17:41:25 -0700 Subject: perf ftrace: Factor out check_ftrace_capable() The check is a common part of the ftrace commands, let's move it out. 
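As a standalone sketch of the extracted-helper pattern (the capability probe is reduced to a plain euid check here; the real helper keeps the perf_cap__capable() calls shown in the diff below):

    #include <stdio.h>
    #include <unistd.h>

    /* Simplified stand-in for the CAP_PERFMON/CAP_SYS_ADMIN probe. */
    static int check_ftrace_capable(void)
    {
            if (geteuid() != 0) {
                    fprintf(stderr, "ftrace only works for root!\n");
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            /* One early call in cmd_ftrace() replaces the per-subcommand copies. */
            if (check_ftrace_capable() < 0)
                    return 1;
            puts("capable, continuing");
            return 0;
    }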
Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/20240729004127.238611-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 33c52ebda2fd..978608ecd89c 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -59,6 +59,22 @@ static void ftrace__workload_exec_failed_signal(int signo __maybe_unused, done = true; } +static int check_ftrace_capable(void) +{ + if (!(perf_cap__capable(CAP_PERFMON) || + perf_cap__capable(CAP_SYS_ADMIN))) { + pr_err("ftrace only works for %s!\n", +#ifdef HAVE_LIBCAP_SUPPORT + "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" +#else + "root" +#endif + ); + return -1; + } + return 0; +} + static int __write_tracing_file(const char *name, const char *val, bool append) { char *file; @@ -586,18 +602,6 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace) .events = POLLIN, }; - if (!(perf_cap__capable(CAP_PERFMON) || - perf_cap__capable(CAP_SYS_ADMIN))) { - pr_err("ftrace only works for %s!\n", -#ifdef HAVE_LIBCAP_SUPPORT - "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" -#else - "root" -#endif - ); - return -1; - } - select_tracer(ftrace); if (reset_tracing_files(ftrace) < 0) { @@ -902,18 +906,6 @@ static int __cmd_latency(struct perf_ftrace *ftrace) }; int buckets[NUM_BUCKET] = { }; - if (!(perf_cap__capable(CAP_PERFMON) || - perf_cap__capable(CAP_SYS_ADMIN))) { - pr_err("ftrace only works for %s!\n", -#ifdef HAVE_LIBCAP_SUPPORT - "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" -#else - "root" -#endif - ); - return -1; - } - trace_fd = prepare_func_latency(ftrace); if (trace_fd < 0) goto out; @@ -1220,6 +1212,10 @@ int cmd_ftrace(int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGPIPE, sig_handler); + ret = check_ftrace_capable(); + if (ret < 0) + return -1; + ret = perf_config(perf_ftrace_config, &ftrace); if (ret < 0) return -1; -- cgit v1.2.3 From 0f223813edd051a516ec4b1fc23b1fdc00dd3b6d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 28 Jul 2024 17:41:26 -0700 Subject: perf ftrace: Add 'profile' command The 'perf ftrace profile' command is to get function execution profiles using function-graph tracer so that users can see the total, average, max execution time as well as the number of invocations easily. The following is a profile for the perf_event_open syscall. 
$ sudo perf ftrace profile -G __x64_sys_perf_event_open -- \ perf stat -e cycles -C1 true 2> /dev/null | head # Total (us) Avg (us) Max (us) Count Function 65.611 65.611 65.611 1 __x64_sys_perf_event_open 30.527 30.527 30.527 1 anon_inode_getfile 30.260 30.260 30.260 1 __anon_inode_getfile 29.700 29.700 29.700 1 alloc_file_pseudo 17.578 17.578 17.578 1 d_alloc_pseudo 17.382 17.382 17.382 1 __d_alloc 16.738 16.738 16.738 1 kmem_cache_alloc_lru 15.686 15.686 15.686 1 perf_event_alloc 14.012 7.006 11.264 2 obj_cgroup_charge # Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/20240729004127.238611-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-ftrace.txt | 42 +++- tools/perf/builtin-ftrace.c | 318 ++++++++++++++++++++++++++++++- tools/perf/util/ftrace.h | 2 + 3 files changed, 359 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index 7ea1645a13cf..33f32467f287 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -9,7 +9,7 @@ perf-ftrace - simple wrapper for kernel's ftrace functionality SYNOPSIS -------- [verse] -'perf ftrace' {trace|latency} +'perf ftrace' {trace|latency|profile} DESCRIPTION ----------- @@ -23,6 +23,9 @@ kernel's ftrace infrastructure. 'perf ftrace latency' calculates execution latency of a given function (optionally with BPF) and display it as a histogram. + 'perf ftrace profile' show a execution profile for each function including + total, average, max time and the number of calls. + The following options apply to perf ftrace. COMMON OPTIONS @@ -146,6 +149,43 @@ OPTIONS for 'perf ftrace latency' Use nano-second instead of micro-second as a base unit of the histogram. +OPTIONS for 'perf ftrace profile' +--------------------------------- + +-T:: +--trace-funcs=:: + Set function filter on the given function (or a glob pattern). + Multiple functions can be given by using this option more than once. + The function argument also can be a glob pattern. It will be passed + to 'set_ftrace_filter' in tracefs. + +-N:: +--notrace-funcs=:: + Do not trace functions given by the argument. Like -T option, this + can be used more than once to specify multiple functions (or glob + patterns). It will be passed to 'set_ftrace_notrace' in tracefs. + +-G:: +--graph-funcs=:: + Set graph filter on the given function (or a glob pattern). This is + useful to trace for functions executed from the given function. This + can be used more than once to specify multiple functions. It will be + passed to 'set_graph_function' in tracefs. + +-g:: +--nograph-funcs=:: + Set graph notrace filter on the given function (or a glob pattern). + Like -G option, this is useful for the function_graph tracer only and + disables tracing for function executed from the given function. This + can be used more than once to specify multiple functions. It will be + passed to 'set_graph_notrace' in tracefs. + +-m:: +--buffer-size:: + Set the size of per-cpu tracing buffer, is expected to + be a number with appended unit character - B/K/M/G. 
+ + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-trace[1] diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 978608ecd89c..ae9389435d1b 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -22,15 +23,18 @@ #include "debug.h" #include #include +#include #include #include "evlist.h" #include "target.h" #include "cpumap.h" +#include "hashmap.h" #include "thread_map.h" #include "strfilter.h" #include "util/cap.h" #include "util/config.h" #include "util/ftrace.h" +#include "util/stat.h" #include "util/units.h" #include "util/parse-sublevel-options.h" @@ -959,6 +963,294 @@ out: return (done && !workload_exec_errno) ? 0 : -1; } +static size_t profile_hash(long func, void *ctx __maybe_unused) +{ + return str_hash((char *)func); +} + +static bool profile_equal(long func1, long func2, void *ctx __maybe_unused) +{ + return !strcmp((char *)func1, (char *)func2); +} + +static int prepare_func_profile(struct perf_ftrace *ftrace) +{ + ftrace->tracer = "function_graph"; + ftrace->graph_tail = 1; + + ftrace->profile_hash = hashmap__new(profile_hash, profile_equal, NULL); + if (ftrace->profile_hash == NULL) + return -ENOMEM; + + return 0; +} + +/* This is saved in a hashmap keyed by the function name */ +struct ftrace_profile_data { + struct stats st; +}; + +static int add_func_duration(struct perf_ftrace *ftrace, char *func, double time_ns) +{ + struct ftrace_profile_data *prof = NULL; + + if (!hashmap__find(ftrace->profile_hash, func, &prof)) { + char *key = strdup(func); + + if (key == NULL) + return -ENOMEM; + + prof = zalloc(sizeof(*prof)); + if (prof == NULL) { + free(key); + return -ENOMEM; + } + + init_stats(&prof->st); + hashmap__add(ftrace->profile_hash, key, prof); + } + + update_stats(&prof->st, time_ns); + return 0; +} + +/* + * The ftrace function_graph text output normally looks like below: + * + * CPU DURATION FUNCTION + * + * 0) | syscall_trace_enter.isra.0() { + * 0) | __audit_syscall_entry() { + * 0) | auditd_test_task() { + * 0) 0.271 us | __rcu_read_lock(); + * 0) 0.275 us | __rcu_read_unlock(); + * 0) 1.254 us | } /\* auditd_test_task *\/ + * 0) 0.279 us | ktime_get_coarse_real_ts64(); + * 0) 2.227 us | } /\* __audit_syscall_entry *\/ + * 0) 2.713 us | } /\* syscall_trace_enter.isra.0 *\/ + * + * Parse the line and get the duration and function name. + */ +static int parse_func_duration(struct perf_ftrace *ftrace, char *line, size_t len) +{ + char *p; + char *func; + double duration; + + /* skip CPU */ + p = strchr(line, ')'); + if (p == NULL) + return 0; + + /* get duration */ + p = skip_spaces(p + 1); + + /* no duration? */ + if (p == NULL || *p == '|') + return 0; + + /* skip markers like '*' or '!' for longer than ms */ + if (!isdigit(*p)) + p++; + + duration = strtod(p, &p); + + if (strncmp(p, " us", 3)) { + pr_debug("non-usec time found.. ignoring\n"); + return 0; + } + + /* + * profile stat keeps the max and min values as integer, + * convert to nsec time so that we can have accurate max. 
+ */ + duration *= 1000; + + /* skip to the pipe */ + while (p < line + len && *p != '|') + p++; + + if (*p++ != '|') + return -EINVAL; + + /* get function name */ + func = skip_spaces(p); + + /* skip the closing bracket and the start of comment */ + if (*func == '}') + func += 5; + + /* remove semi-colon or end of comment at the end */ + p = line + len - 1; + while (!isalnum(*p) && *p != ']') { + *p = '\0'; + --p; + } + + return add_func_duration(ftrace, func, duration); +} + +static int cmp_profile_data(const void *a, const void *b) +{ + const struct hashmap_entry *e1 = *(const struct hashmap_entry **)a; + const struct hashmap_entry *e2 = *(const struct hashmap_entry **)b; + struct ftrace_profile_data *p1 = e1->pvalue; + struct ftrace_profile_data *p2 = e2->pvalue; + + /* compare by total time */ + if ((p1->st.n * p1->st.mean) > (p2->st.n * p2->st.mean)) + return -1; + else + return 1; +} + +static void print_profile_result(struct perf_ftrace *ftrace) +{ + struct hashmap_entry *entry, **profile; + size_t i, nr, bkt; + + nr = hashmap__size(ftrace->profile_hash); + if (nr == 0) + return; + + profile = calloc(nr, sizeof(*profile)); + if (profile == NULL) { + pr_err("failed to allocate memory for the result\n"); + return; + } + + i = 0; + hashmap__for_each_entry(ftrace->profile_hash, entry, bkt) + profile[i++] = entry; + + assert(i == nr); + + //cmp_profile_data(profile[0], profile[1]); + qsort(profile, nr, sizeof(*profile), cmp_profile_data); + + printf("# %10s %10s %10s %10s %s\n", + "Total (us)", "Avg (us)", "Max (us)", "Count", "Function"); + + for (i = 0; i < nr; i++) { + const char *name = profile[i]->pkey; + struct ftrace_profile_data *p = profile[i]->pvalue; + + printf("%12.3f %10.3f %6"PRIu64".%03"PRIu64" %10.0f %s\n", + p->st.n * p->st.mean / 1000, p->st.mean / 1000, + p->st.max / 1000, p->st.max % 1000, p->st.n, name); + } + + free(profile); + + hashmap__for_each_entry(ftrace->profile_hash, entry, bkt) { + free((char *)entry->pkey); + free(entry->pvalue); + } + + hashmap__free(ftrace->profile_hash); + ftrace->profile_hash = NULL; +} + +static int __cmd_profile(struct perf_ftrace *ftrace) +{ + char *trace_file; + int trace_fd; + char buf[4096]; + struct io io; + char *line = NULL; + size_t line_len = 0; + + if (prepare_func_profile(ftrace) < 0) { + pr_err("failed to prepare func profiler\n"); + goto out; + } + + if (reset_tracing_files(ftrace) < 0) { + pr_err("failed to reset ftrace\n"); + goto out; + } + + /* reset ftrace buffer */ + if (write_tracing_file("trace", "0") < 0) + goto out; + + if (set_tracing_options(ftrace) < 0) + return -1; + + if (write_tracing_file("current_tracer", ftrace->tracer) < 0) { + pr_err("failed to set current_tracer to %s\n", ftrace->tracer); + goto out_reset; + } + + setup_pager(); + + trace_file = get_tracing_file("trace_pipe"); + if (!trace_file) { + pr_err("failed to open trace_pipe\n"); + goto out_reset; + } + + trace_fd = open(trace_file, O_RDONLY); + + put_tracing_file(trace_file); + + if (trace_fd < 0) { + pr_err("failed to open trace_pipe\n"); + goto out_reset; + } + + fcntl(trace_fd, F_SETFL, O_NONBLOCK); + + if (write_tracing_file("tracing_on", "1") < 0) { + pr_err("can't enable tracing\n"); + goto out_close_fd; + } + + evlist__start_workload(ftrace->evlist); + + io__init(&io, trace_fd, buf, sizeof(buf)); + io.timeout_ms = -1; + + while (!done && !io.eof) { + if (io__getline(&io, &line, &line_len) < 0) + break; + + if (parse_func_duration(ftrace, line, line_len) < 0) + break; + } + + write_tracing_file("tracing_on", "0"); + + if 
(workload_exec_errno) { + const char *emsg = str_error_r(workload_exec_errno, buf, sizeof(buf)); + /* flush stdout first so below error msg appears at the end. */ + fflush(stdout); + pr_err("workload failed: %s\n", emsg); + goto out_free_line; + } + + /* read remaining buffer contents */ + io.timeout_ms = 0; + while (!io.eof) { + if (io__getline(&io, &line, &line_len) < 0) + break; + + if (parse_func_duration(ftrace, line, line_len) < 0) + break; + } + + print_profile_result(ftrace); + +out_free_line: + free(line); +out_close_fd: + close(trace_fd); +out_reset: + reset_tracing_files(ftrace); +out: + return (done && !workload_exec_errno) ? 0 : -1; +} + static int perf_ftrace_config(const char *var, const char *value, void *cb) { struct perf_ftrace *ftrace = cb; @@ -1126,6 +1418,7 @@ enum perf_ftrace_subcommand { PERF_FTRACE_NONE, PERF_FTRACE_TRACE, PERF_FTRACE_LATENCY, + PERF_FTRACE_PROFILE, }; int cmd_ftrace(int argc, const char **argv) @@ -1191,13 +1484,28 @@ int cmd_ftrace(int argc, const char **argv) "Use nano-second histogram"), OPT_PARENT(common_options), }; + const struct option profile_options[] = { + OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func", + "Trace given functions using function tracer", + parse_filter_func), + OPT_CALLBACK('N', "notrace-funcs", &ftrace.notrace, "func", + "Do not trace given functions", parse_filter_func), + OPT_CALLBACK('G', "graph-funcs", &ftrace.graph_funcs, "func", + "Trace given functions using function_graph tracer", + parse_filter_func), + OPT_CALLBACK('g', "nograph-funcs", &ftrace.nograph_funcs, "func", + "Set nograph filter on given functions", parse_filter_func), + OPT_CALLBACK('m', "buffer-size", &ftrace.percpu_buffer_size, "size", + "Size of per cpu buffer, needs to use a B, K, M or G suffix.", parse_buffer_size), + OPT_PARENT(common_options), + }; const struct option *options = ftrace_options; const char * const ftrace_usage[] = { "perf ftrace [] []", "perf ftrace [] -- [] []", - "perf ftrace {trace|latency} [] []", - "perf ftrace {trace|latency} [] -- [] []", + "perf ftrace {trace|latency|profile} [] []", + "perf ftrace {trace|latency|profile} [] -- [] []", NULL }; enum perf_ftrace_subcommand subcmd = PERF_FTRACE_NONE; @@ -1226,6 +1534,9 @@ int cmd_ftrace(int argc, const char **argv) } else if (!strcmp(argv[1], "latency")) { subcmd = PERF_FTRACE_LATENCY; options = latency_options; + } else if (!strcmp(argv[1], "profile")) { + subcmd = PERF_FTRACE_PROFILE; + options = profile_options; } if (subcmd != PERF_FTRACE_NONE) { @@ -1261,6 +1572,9 @@ int cmd_ftrace(int argc, const char **argv) } cmd_func = __cmd_latency; break; + case PERF_FTRACE_PROFILE: + cmd_func = __cmd_profile; + break; case PERF_FTRACE_NONE: default: pr_err("Invalid subcommand\n"); diff --git a/tools/perf/util/ftrace.h b/tools/perf/util/ftrace.h index 2515e841c64c..bae649ef50e8 100644 --- a/tools/perf/util/ftrace.h +++ b/tools/perf/util/ftrace.h @@ -6,6 +6,7 @@ #include "target.h" struct evlist; +struct hashamp; struct perf_ftrace { struct evlist *evlist; @@ -15,6 +16,7 @@ struct perf_ftrace { struct list_head notrace; struct list_head graph_funcs; struct list_head nograph_funcs; + struct hashmap *profile_hash; unsigned long percpu_buffer_size; bool inherit; bool use_nsec; -- cgit v1.2.3 From 74ae366c37b71b46be7f2fa45fa4b2c9c6708fbe Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 28 Jul 2024 17:41:27 -0700 Subject: perf ftrace profile: Add -s/--sort option The -s/--sort option is to sort the output by given column. 
$ sudo perf ftrace profile -s max sync | head # Total (us) Avg (us) Max (us) Count Function 6301.811 6301.811 6301.811 1 __do_sys_sync 6301.328 6301.328 6301.328 1 ksys_sync 5320.300 1773.433 2858.819 3 iterate_supers 2755.875 17.012 2610.633 162 sync_fs_one_sb 2728.351 682.088 2610.413 4 ext4_sync_fs [ext4] 2603.654 2603.654 2603.654 1 jbd2_log_wait_commit [jbd2] 4750.615 593.827 2597.427 8 schedule 2164.986 26.728 2115.673 81 sync_inodes_one_sb 2143.842 26.467 2115.438 81 sync_inodes_sb Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/20240729004127.238611-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-ftrace.txt | 5 +++ tools/perf/builtin-ftrace.c | 63 +++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index 33f32467f287..eaec8253be68 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -185,6 +185,11 @@ OPTIONS for 'perf ftrace profile' Set the size of per-cpu tracing buffer, is expected to be a number with appended unit character - B/K/M/G. +-s:: +--sort=:: + Sort the result by the given field. Available values are: + total, avg, max, count, name. Default is 'total'. + SEE ALSO -------- diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index ae9389435d1b..a615c405d98f 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -1090,15 +1090,47 @@ static int parse_func_duration(struct perf_ftrace *ftrace, char *line, size_t le return add_func_duration(ftrace, func, duration); } +enum perf_ftrace_profile_sort_key { + PFP_SORT_TOTAL = 0, + PFP_SORT_AVG, + PFP_SORT_MAX, + PFP_SORT_COUNT, + PFP_SORT_NAME, +}; + +static enum perf_ftrace_profile_sort_key profile_sort = PFP_SORT_TOTAL; + static int cmp_profile_data(const void *a, const void *b) { const struct hashmap_entry *e1 = *(const struct hashmap_entry **)a; const struct hashmap_entry *e2 = *(const struct hashmap_entry **)b; struct ftrace_profile_data *p1 = e1->pvalue; struct ftrace_profile_data *p2 = e2->pvalue; + double v1, v2; + + switch (profile_sort) { + case PFP_SORT_NAME: + return strcmp(e1->pkey, e2->pkey); + case PFP_SORT_AVG: + v1 = p1->st.mean; + v2 = p2->st.mean; + break; + case PFP_SORT_MAX: + v1 = p1->st.max; + v2 = p2->st.max; + break; + case PFP_SORT_COUNT: + v1 = p1->st.n; + v2 = p2->st.n; + break; + case PFP_SORT_TOTAL: + default: + v1 = p1->st.n * p1->st.mean; + v2 = p2->st.n * p2->st.mean; + break; + } - /* compare by total time */ - if ((p1->st.n * p1->st.mean) > (p2->st.n * p2->st.mean)) + if (v1 > v2) return -1; else return 1; @@ -1414,6 +1446,30 @@ static int parse_graph_tracer_opts(const struct option *opt, return 0; } +static int parse_sort_key(const struct option *opt, const char *str, int unset) +{ + enum perf_ftrace_profile_sort_key *key = (void *)opt->value; + + if (unset) + return 0; + + if (!strcmp(str, "total")) + *key = PFP_SORT_TOTAL; + else if (!strcmp(str, "avg")) + *key = PFP_SORT_AVG; + else if (!strcmp(str, "max")) + *key = PFP_SORT_MAX; + else if (!strcmp(str, "count")) + *key = PFP_SORT_COUNT; + else if (!strcmp(str, "name")) + *key = PFP_SORT_NAME; + else { + pr_err("Unknown sort key: %s\n", str); + return -1; + } + 
return 0; +} + enum perf_ftrace_subcommand { PERF_FTRACE_NONE, PERF_FTRACE_TRACE, @@ -1497,6 +1553,9 @@ int cmd_ftrace(int argc, const char **argv) "Set nograph filter on given functions", parse_filter_func), OPT_CALLBACK('m', "buffer-size", &ftrace.percpu_buffer_size, "size", "Size of per cpu buffer, needs to use a B, K, M or G suffix.", parse_buffer_size), + OPT_CALLBACK('s', "sort", &profile_sort, "key", + "Sort result by key: total (default), avg, max, count, name.", + parse_sort_key), OPT_PARENT(common_options), }; const struct option *options = ftrace_options; -- cgit v1.2.3 From 4ed0f392e7dbd2e90a903bdd77d1f6e61b7d3073 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Wed, 22 May 2024 13:42:54 -0700 Subject: perf test: make metric validation test return early when there is no metric supported on the test system Add a check to return the metric validation test early when "perf list metric" does not output any metric. This would happen when NO_JEVENTS=1 is set or on a system where no metric is supported. Signed-off-by: Weilin Wang Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/lkml/20240522204254.1841420-1-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/perf_metric_validation.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/lib/perf_metric_validation.py b/tools/perf/tests/shell/lib/perf_metric_validation.py index a2d235252183..0b94216c9c46 100644 --- a/tools/perf/tests/shell/lib/perf_metric_validation.py +++ b/tools/perf/tests/shell/lib/perf_metric_validation.py @@ -95,7 +95,7 @@ class Validator: indent=4) def get_results(self, idx: int = 0): - return self.results[idx] + return self.results.get(idx) def get_bounds(self, lb, ub, error, alias={}, ridx: int = 0) -> list: """ @@ -173,7 +173,10 @@ class Validator: pcnt = 0 tcnt = 0 rerun = list() - for name, val in self.get_results().items(): + results = self.get_results() + if not results: + return + for name, val in results.items(): if val < 0: negmetric[name] = val rerun.append(name) @@ -532,6 +535,9 @@ class Validator: ''' if not self.collectlist: self.parse_perf_metrics() + if not self.metrics: + print("No metric found for testing") + return 0 self.create_rules() for i in range(0, len(self.workloads)): self.wlidx = i -- cgit v1.2.3 From ccd6fcda257573081b308f3f8f002a15810eba7c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 25 Jul 2024 07:46:19 +0100 Subject: perf arm-spe: Extract evsel setting up The evsel for the Arm SPE PMU needs to be set up. Extract the setting up into a function arm_spe_setup_evsel().
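The shape of the refactor, reduced to a runnable toy (perf's evsel and evlist types are replaced by a plain array; only the structure is meant to carry over):

    #include <stdbool.h>
    #include <stdio.h>

    struct evsel { bool is_aux; bool configured; };

    /* All per-event SPE setup now lives in one helper... */
    static void setup_evsel(struct evsel *evsel)
    {
            evsel->configured = true;
    }

    int main(void)
    {
            struct evsel evlist[] = { { .is_aux = true }, { .is_aux = false } };

            /*
             * ...so callers can apply it to every matching event, which is
             * what the follow-up multi-SPE patch below relies on.
             */
            for (int i = 0; i < 2; i++) {
                    if (evlist[i].is_aux)
                            setup_evsel(&evlist[i]);
            }
            printf("%d %d\n", evlist[0].configured, evlist[1].configured); /* 1 0 */
            return 0;
    }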
Signed-off-by: Leo Yan Cc: Mark Rutland Cc: Suzuki K Poulose Cc: Ian Rogers Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: James Clark Cc: Mike Leach Cc: John Garry Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/arm-spe.c | 74 +++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 0b52e67edb3b..fe7942824113 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -132,6 +132,45 @@ static __u64 arm_spe_pmu__sample_period(const struct perf_pmu *arm_spe_pmu) return sample_period; } +static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus) +{ + u64 bit; + + evsel->core.attr.freq = 0; + evsel->core.attr.sample_period = arm_spe_pmu__sample_period(evsel->pmu); + evsel->needs_auxtrace_mmap = true; + + /* + * To obtain the auxtrace buffer file descriptor, the auxtrace event + * must come first. + */ + evlist__to_front(evsel->evlist, evsel); + + /* + * In the case of per-cpu mmaps, sample CPU for AUX event; + * also enable the timestamp tracing for samples correlation. + */ + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { + evsel__set_sample_bit(evsel, CPU); + evsel__set_config_if_unset(evsel->pmu, evsel, "ts_enable", 1); + } + + /* + * Set this only so that perf report knows that SPE generates memory info. It has no effect + * on the opening of the event or the SPE data produced. + */ + evsel__set_sample_bit(evsel, DATA_SRC); + + /* + * The PHYS_ADDR flag does not affect the driver behaviour, it is used to + * inform that the resulting output's SPE samples contain physical addresses + * where applicable. + */ + bit = perf_pmu__format_bits(evsel->pmu, "pa_enable"); + if (evsel->core.attr.config & bit) + evsel__set_sample_bit(evsel, PHYS_ADDR); +} + static int arm_spe_recording_options(struct auxtrace_record *itr, struct evlist *evlist, struct record_opts *opts) @@ -144,7 +183,6 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, bool privileged = perf_event_paranoid_check(-1); struct evsel *tracking_evsel; int err; - u64 bit; sper->evlist = evlist; @@ -154,9 +192,6 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, pr_err("There may be only one " ARM_SPE_PMU_NAME "x event\n"); return -EINVAL; } - evsel->core.attr.freq = 0; - evsel->core.attr.sample_period = arm_spe_pmu__sample_period(arm_spe_pmu); - evsel->needs_auxtrace_mmap = true; arm_spe_evsel = evsel; opts->full_auxtrace = true; } @@ -222,36 +257,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, pr_debug2("%sx snapshot size: %zu\n", ARM_SPE_PMU_NAME, opts->auxtrace_snapshot_size); - /* - * To obtain the auxtrace buffer file descriptor, the auxtrace event - * must come first. - */ - evlist__to_front(evlist, arm_spe_evsel); - - /* - * In the case of per-cpu mmaps, sample CPU for AUX event; - * also enable the timestamp tracing for samples correlation. - */ - if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { - evsel__set_sample_bit(arm_spe_evsel, CPU); - evsel__set_config_if_unset(arm_spe_pmu, arm_spe_evsel, - "ts_enable", 1); - } - - /* - * Set this only so that perf report knows that SPE generates memory info. It has no effect - * on the opening of the event or the SPE data produced.
- */ - evsel__set_sample_bit(arm_spe_evsel, DATA_SRC); - - /* - * The PHYS_ADDR flag does not affect the driver behaviour, it is used to - * inform that the resulting output's SPE samples contain physical addresses - * where applicable. - */ - bit = perf_pmu__format_bits(arm_spe_pmu, "pa_enable"); - if (arm_spe_evsel->core.attr.config & bit) - evsel__set_sample_bit(arm_spe_evsel, PHYS_ADDR); + arm_spe_setup_evsel(arm_spe_evsel, cpus); /* Add dummy event to keep tracking */ err = parse_event(evlist, "dummy:u"); -- cgit v1.2.3 From 1635bdca4b02bbe6529e769dd7d56a98960344fc Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 25 Jul 2024 07:46:20 +0100 Subject: perf arm-spe: Support multiple Arm SPE events As the flag 'auxtrace' has been set for Arm SPE events, now it is ready to use evsel__is_aux_event() to check if an event is an AUX trace event or not. Use this function to replace the old check that only handled the first Arm SPE event. Signed-off-by: Leo Yan Cc: Mark Rutland Cc: Suzuki K Poulose Cc: Ian Rogers Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: James Clark Cc: Mike Leach Cc: John Garry Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/arm-spe.c | 37 +++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index fe7942824113..d59f6ca499f2 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -177,8 +178,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, { struct arm_spe_recording *sper = container_of(itr, struct arm_spe_recording, itr); - struct perf_pmu *arm_spe_pmu = sper->arm_spe_pmu; - struct evsel *evsel, *arm_spe_evsel = NULL; + struct evsel *evsel, *tmp; struct perf_cpu_map *cpus = evlist->core.user_requested_cpus; bool privileged = perf_event_paranoid_check(-1); struct evsel *tracking_evsel; @@ -187,12 +187,12 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, sper->evlist = evlist; evlist__for_each_entry(evlist, evsel) { - if (evsel->core.attr.type == arm_spe_pmu->type) { - if (arm_spe_evsel) { - pr_err("There may be only one " ARM_SPE_PMU_NAME "x event\n"); + if (evsel__is_aux_event(evsel)) { + if (!strstarts(evsel->pmu_name, ARM_SPE_PMU_NAME)) { + pr_err("Found unexpected auxtrace event: %s\n", + evsel->pmu_name); return -EINVAL; } - arm_spe_evsel = evsel; opts->full_auxtrace = true; } } @@ -257,7 +257,10 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, pr_debug2("%sx snapshot size: %zu\n", ARM_SPE_PMU_NAME, opts->auxtrace_snapshot_size); - arm_spe_setup_evsel(arm_spe_evsel, cpus); + evlist__for_each_entry_safe(evlist, tmp, evsel) { + if (evsel__is_aux_event(evsel)) + arm_spe_setup_evsel(evsel, cpus); + } /* Add dummy event to keep tracking */ err = parse_event(evlist, "dummy:u"); @@ -307,12 +310,16 @@ static int arm_spe_snapshot_start(struct auxtrace_record *itr) struct arm_spe_recording *ptr = container_of(itr, struct arm_spe_recording, itr); struct evsel *evsel; + int ret = -EINVAL; evlist__for_each_entry(ptr->evlist, evsel) { - if (evsel->core.attr.type == ptr->arm_spe_pmu->type) - return evsel__disable(evsel); + if (evsel__is_aux_event(evsel)) { + ret = evsel__disable(evsel); + if (ret < 0) + return ret; + } } - return -EINVAL; + return ret; } static int arm_spe_snapshot_finish(struct auxtrace_record *itr)
@@ -320,12 +327,16 @@ static int arm_spe_snapshot_finish(struct auxtrace_record *itr) { struct arm_spe_recording *ptr = container_of(itr, struct arm_spe_recording, itr); struct evsel *evsel; + int ret = -EINVAL; evlist__for_each_entry(ptr->evlist, evsel) { - if (evsel->core.attr.type == ptr->arm_spe_pmu->type) - return evsel__enable(evsel); + if (evsel__is_aux_event(evsel)) { + ret = evsel__enable(evsel); + if (ret < 0) + return ret; + } } - return -EINVAL; + return ret; } static int arm_spe_alloc_wrapped_array(struct arm_spe_recording *ptr, int idx) -- cgit v1.2.3 From d261f9ebcf424535fe04e720a1cfa023be409f52 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Mon, 29 Jul 2024 16:11:02 -0700 Subject: libperf: Add gitignore Ignore files that are generated by libperf and libperf tests. Signed-off-by: Charlie Jenkins Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240729-libperf_gitignore-v1-1-1c70dd98edf9@rivosinc.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/.gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 tools/lib/perf/.gitignore (limited to 'tools') diff --git a/tools/lib/perf/.gitignore b/tools/lib/perf/.gitignore new file mode 100644 index 000000000000..0f5b4af63f62 --- /dev/null +++ b/tools/lib/perf/.gitignore @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +libperf.pc +libperf.so.* +tests-shared +tests-static -- cgit v1.2.3 From 839b1832e68a5b7d8c81c9281296b03e5ba7c31a Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 Jul 2024 14:23:00 +0800 Subject: perf tools: Fix wrong message when running "make JOBS=1" There is only one job when running "make JOBS=1", so it should print "sequential build" rather than "parallel build". Before: $ cd tools/perf && make JOBS=1 BUILD: Doing 'make -j1' parallel build After: $ cd tools/perf && make JOBS=1 BUILD: Doing 'make -j1' sequential build Signed-off-by: Tiezhu Yang Tested-by: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240730062301.23244-2-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 75f3f6e0a231..816d5d84816b 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -51,8 +51,14 @@ else override DEBUG = 0 endif +ifeq ($(JOBS),1) + BUILD_TYPE := sequential +else + BUILD_TYPE := parallel +endif + define print_msg - @printf ' BUILD: Doing '\''make \033[33m-j'$(JOBS)'\033[m'\'' parallel build\n' + @printf ' BUILD: Doing '\''make \033[33m-j'$(JOBS)'\033[m'\'' $(BUILD_TYPE) build\n' endef define make -- cgit v1.2.3 From b48543c451c30387b53ee6e202dda8d5303f6268 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 Jul 2024 14:23:01 +0800 Subject: perf list: Give clues if failed to open tracing events directory When executing the command "perf list", I met "Error: failed to open tracing events directory" twice. The first reason is that there is no "/sys/kernel/tracing/events" directory because the kernel does not enable the tracing infrastructure with CONFIG_FTRACE; the second reason is that there are no root privileges. Add the error string to tell the users what happened and what to do, and also call put_tracing_file() to free events_path a little later to avoid messy code in the error message.
At the same time, just remove the redundant "/" of the file path in the function get_tracing_file(), otherwise it shows something like "/sys/kernel/tracing//events". Before: $ ./perf list Error: failed to open tracing events directory After: (1) Without CONFIG_FTRACE $ ./perf list Error: failed to open tracing events directory /sys/kernel/tracing/events: No such file or directory (2) With CONFIG_FTRACE but no root privileges $ ./perf list Error: failed to open tracing events directory /sys/kernel/tracing/events: Permission denied Committer testing: Redirect stdout to null to quickly test the patch: Before: $ perf list > /dev/null Error: failed to open tracing events directory $ After: $ perf list > /dev/null Error: failed to open tracing events directory /sys/kernel/tracing/events: Permission denied $ Signed-off-by: Tiezhu Yang Tested-by: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240730062301.23244-3-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/tracing_path.c | 2 +- tools/perf/util/print-events.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/api/fs/tracing_path.c b/tools/lib/api/fs/tracing_path.c index 30745f35d0d2..834fd64c7130 100644 --- a/tools/lib/api/fs/tracing_path.c +++ b/tools/lib/api/fs/tracing_path.c @@ -69,7 +69,7 @@ char *get_tracing_file(const char *name) { char *file; - if (asprintf(&file, "%s/%s", tracing_path_mount(), name) < 0) + if (asprintf(&file, "%s%s", tracing_path_mount(), name) < 0) return NULL; return file; diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 3f38c27f0157..81e0135cddf0 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -68,11 +68,12 @@ void print_tracepoint_events(const struct print_callbacks *print_cb __maybe_unus struct dirent **sys_namelist = NULL; int sys_items; - put_tracing_file(events_path); if (events_fd < 0) { pr_err("Error: failed to open tracing events directory\n"); + pr_err("%s: %s\n", events_path, strerror(errno)); return; } + put_tracing_file(events_path); sys_items = tracing_events__scandir_alphasort(&sys_namelist); -- cgit v1.2.3 From 0f2c0400b560a3752fac9e583c1576a53a51cde9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 30 Jul 2024 12:17:43 -0700 Subject: perf jevents: Use name for special find value (PMU_EVENTS__NOT_FOUND) -1000 was used as a special value added in Commit 3d5045492ab2 ("perf pmu-events: Add pmu_events_table__find_event()") to show that 1 table lacked a PMU/event but that didn't terminate the search in other tables. Add a new constant PMU_EVENTS__NOT_FOUND for this value and use it. 
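A standalone illustration of the search contract (toy tables stand in for the generated pmu_events tables; the real loop is in the diff below):

    #include <stdio.h>

    #define PMU_EVENTS__NOT_FOUND -1000

    /*
     * Returns the callback result (0 here), or the sentinel when this
     * table has no matching PMU/event.
     */
    static int table_find(int has_event)
    {
            return has_event ? 0 : PMU_EVENTS__NOT_FOUND;
    }

    int main(void)
    {
            int tables[] = { 0, 0, 1 };  /* only the last table matches */

            for (int i = 0; i < 3; i++) {
                    int ret = table_find(tables[i]);

                    /* Only a non-sentinel value terminates the search. */
                    if (ret != PMU_EVENTS__NOT_FOUND) {
                            printf("found in table %d\n", i);
                            return 0;
                    }
            }
            printf("not found\n");
            return 1;
    }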
Reviewed-by: John Garry Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jing Zhang Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Sang Cc: Peter Zijlstra Cc: Philip Li Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Link: https://lore.kernel.org/r/20240730191744.3097329-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.py | 6 +++--- tools/perf/pmu-events/pmu-events.h | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index ac9b7ca41856..731776e29f47 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -906,7 +906,7 @@ static int pmu_events_table__find_event_pmu(const struct pmu_events_table *table do_call: return fn ? fn(&pe, table, data) : 0; } - return -1000; + return PMU_EVENTS__NOT_FOUND; } int pmu_events_table__for_each_event(const struct pmu_events_table *table, @@ -944,10 +944,10 @@ int pmu_events_table__find_event(const struct pmu_events_table *table, continue; ret = pmu_events_table__find_event_pmu(table, table_pmu, name, fn, data); - if (ret != -1000) + if (ret != PMU_EVENTS__NOT_FOUND) return ret; } - return -1000; + return PMU_EVENTS__NOT_FOUND; } size_t pmu_events_table__num_events(const struct pmu_events_table *table, diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index f5aa96f1685c..5435ad92180c 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -70,6 +70,8 @@ struct pmu_metric { struct pmu_events_table; struct pmu_metrics_table; +#define PMU_EVENTS__NOT_FOUND -1000 + typedef int (*pmu_event_iter_fn)(const struct pmu_event *pe, const struct pmu_events_table *table, void *data); @@ -82,6 +84,13 @@ int pmu_events_table__for_each_event(const struct pmu_events_table *table, struct perf_pmu *pmu, pmu_event_iter_fn fn, void *data); +/* + * Search for table and entry matching with pmu__name_match. Each matching event + * has fn called on it. 0 implies to success/continue the search while non-zero + * means to terminate. The special value PMU_EVENTS__NOT_FOUND is used to + * indicate no event was found in one of the tables which doesn't terminate the + * search of all tables. + */ int pmu_events_table__find_event(const struct pmu_events_table *table, struct perf_pmu *pmu, const char *name, -- cgit v1.2.3 From edb08cdd10b57d30631492e534f5bc13a78e739b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:28 -0700 Subject: perf bpf-filter: Make filters map a single entry hashmap And the value is now an array. This is to support multiple filter entries in the map later. No functional changes intended. 
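The effect on the lookup loop is easiest to see in a standalone model: the BPF program now walks a single array value, and a PBF_OP_DONE-style terminator lets it accept early (ops reduced to two kinds here; the real enum lives in sample-filter.h in the diff below):

    #include <stdio.h>

    #define MAX_FILTERS 64

    enum op { OP_EQ, OP_DONE };

    struct entry { enum op op; long value; };

    /* Mimics the BPF loop over the one array-valued map entry. */
    static int run_filter(const struct entry *e, long sample)
    {
            for (int i = 0; i < MAX_FILTERS; i++) {
                    switch (e[i].op) {
                    case OP_EQ:
                            if (sample != e[i].value)
                                    return 0;  /* drop the sample */
                            break;
                    case OP_DONE:
                            return 1;  /* no failures so far, accept it */
                    }
            }
            return 1;
    }

    int main(void)
    {
            /* User space fills the array and appends the terminator. */
            struct entry filters[MAX_FILTERS] = { { OP_EQ, 42 }, { OP_DONE, 0 } };

            printf("%d %d\n", run_filter(filters, 42), run_filter(filters, 7)); /* 1 0 */
            return 0;
    }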
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 81 +++++++++++++++++++--------- tools/perf/util/bpf_skel/sample-filter.h | 1 + tools/perf/util/bpf_skel/sample_filter.bpf.c | 39 +++++++------- 3 files changed, 78 insertions(+), 43 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 04f98b6bb291..2510832d83f9 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -93,71 +93,102 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr * int perf_bpf_filter__prepare(struct evsel *evsel) { - int i, x, y, fd; + int i, x, y, fd, ret; struct sample_filter_bpf *skel; struct bpf_program *prog; struct bpf_link *link; struct perf_bpf_filter_expr *expr; + struct perf_bpf_filter_entry *entry; + + entry = calloc(MAX_FILTERS, sizeof(*entry)); + if (entry == NULL) + return -1; skel = sample_filter_bpf__open_and_load(); if (!skel) { pr_err("Failed to load perf sample-filter BPF skeleton\n"); - return -1; + ret = -EPERM; + goto err; } i = 0; fd = bpf_map__fd(skel->maps.filters); list_for_each_entry(expr, &evsel->bpf_filters, list) { - struct perf_bpf_filter_entry entry = { - .op = expr->op, - .part = expr->part, - .term = expr->term, - .value = expr->val, - }; + if (check_sample_flags(evsel, expr) < 0) { + ret = -EINVAL; + goto err; + } - if (check_sample_flags(evsel, expr) < 0) - return -1; + if (i == MAX_FILTERS) { + ret = -E2BIG; + goto err; + } - bpf_map_update_elem(fd, &i, &entry, BPF_ANY); + entry[i].op = expr->op; + entry[i].part = expr->part; + entry[i].term = expr->term; + entry[i].value = expr->val; i++; if (expr->op == PBF_OP_GROUP_BEGIN) { struct perf_bpf_filter_expr *group; list_for_each_entry(group, &expr->groups, list) { - struct perf_bpf_filter_entry group_entry = { - .op = group->op, - .part = group->part, - .term = group->term, - .value = group->val, - }; - bpf_map_update_elem(fd, &i, &group_entry, BPF_ANY); + if (i == MAX_FILTERS) { + ret = -E2BIG; + goto err; + } + + entry[i].op = group->op; + entry[i].part = group->part; + entry[i].term = group->term; + entry[i].value = group->val; i++; } - memset(&entry, 0, sizeof(entry)); - entry.op = PBF_OP_GROUP_END; - bpf_map_update_elem(fd, &i, &entry, BPF_ANY); + if (i == MAX_FILTERS) { + ret = -E2BIG; + goto err; + } + + entry[i].op = PBF_OP_GROUP_END; i++; } } - if (i > MAX_FILTERS) { - pr_err("Too many filters: %d (max = %d)\n", i, MAX_FILTERS); - return -1; + if (i < MAX_FILTERS) { + /* to terminate the loop early */ + entry[i].op = PBF_OP_DONE; + i++; + } + + /* The filters map has only one entry for now */ + i = 0; + if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { + ret = -errno; + pr_err("Failed to update the filter map\n"); + goto err; } + prog = skel->progs.perf_sample_filter; for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) { for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) { link = bpf_program__attach_perf_event(prog, FD(evsel, x, y)); if (IS_ERR(link)) { pr_err("Failed to attach perf sample-filter program\n"); - return PTR_ERR(link); + ret = PTR_ERR(link); + goto err; } } } + free(entry); evsel->bpf_skel = skel; return 0; + +err: + free(entry); + sample_filter_bpf__destroy(skel); + return ret; } int 
perf_bpf_filter__destroy(struct evsel *evsel) diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h index 350efa121026..bb6a1b91f1df 100644 --- a/tools/perf/util/bpf_skel/sample-filter.h +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -14,6 +14,7 @@ enum perf_bpf_filter_op { PBF_OP_AND, PBF_OP_GROUP_BEGIN, PBF_OP_GROUP_END, + PBF_OP_DONE, }; enum perf_bpf_filter_term { diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index f59985101973..0d56e52b922c 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -9,10 +9,10 @@ /* BPF map that will be filled by user space */ struct filters { - __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); - __type(value, struct perf_bpf_filter_entry); - __uint(max_entries, MAX_FILTERS); + __type(value, struct perf_bpf_filter_entry[MAX_FILTERS]); + __uint(max_entries, 1); } filters SEC(".maps"); int dropped; @@ -179,39 +179,39 @@ int perf_sample_filter(void *ctx) __u64 sample_data; int in_group = 0; int group_result = 0; - int i; + int i, k; kctx = bpf_cast_to_kern_ctx(ctx); - for (i = 0; i < MAX_FILTERS; i++) { - int key = i; /* needed for verifier :( */ + k = 0; + entry = bpf_map_lookup_elem(&filters, &k); + if (entry == NULL) + goto drop; - entry = bpf_map_lookup_elem(&filters, &key); - if (entry == NULL) - break; - sample_data = perf_get_sample(kctx, entry); + for (i = 0; i < MAX_FILTERS; i++) { + sample_data = perf_get_sample(kctx, &entry[i]); - switch (entry->op) { + switch (entry[i].op) { case PBF_OP_EQ: - CHECK_RESULT(sample_data, ==, entry->value) + CHECK_RESULT(sample_data, ==, entry[i].value) break; case PBF_OP_NEQ: - CHECK_RESULT(sample_data, !=, entry->value) + CHECK_RESULT(sample_data, !=, entry[i].value) break; case PBF_OP_GT: - CHECK_RESULT(sample_data, >, entry->value) + CHECK_RESULT(sample_data, >, entry[i].value) break; case PBF_OP_GE: - CHECK_RESULT(sample_data, >=, entry->value) + CHECK_RESULT(sample_data, >=, entry[i].value) break; case PBF_OP_LT: - CHECK_RESULT(sample_data, <, entry->value) + CHECK_RESULT(sample_data, <, entry[i].value) break; case PBF_OP_LE: - CHECK_RESULT(sample_data, <=, entry->value) + CHECK_RESULT(sample_data, <=, entry[i].value) break; case PBF_OP_AND: - CHECK_RESULT(sample_data, &, entry->value) + CHECK_RESULT(sample_data, &, entry[i].value) break; case PBF_OP_GROUP_BEGIN: in_group = 1; @@ -222,6 +222,9 @@ int perf_sample_filter(void *ctx) goto drop; in_group = 0; break; + case PBF_OP_DONE: + /* no failures so far, accept it */ + return 1; } } /* generate sample data */ -- cgit v1.2.3 From 966854e72f6e8a259609ea3c7fd78215e6606c7b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:29 -0700 Subject: perf bpf-filter: Pass 'target' to perf_bpf_filter__prepare() This is needed to prepare target-specific actions in the later patch. We want to reuse the pinned BPF program and map for regular users to profile their own processes. 
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-stat.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/util/bpf-filter.c | 2 +- tools/perf/util/bpf-filter.h | 6 ++++-- tools/perf/util/evlist.c | 5 +++-- tools/perf/util/evlist.h | 4 +++- 8 files changed, 15 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a94516e8c522..c599e620ee89 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1389,7 +1389,7 @@ try_again: "even with a suitable vmlinux or kallsyms file.\n\n"); } - if (evlist__apply_filters(evlist, &pos)) { + if (evlist__apply_filters(evlist, &pos, &opts->target)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", pos->filter ?: "BPF", evsel__name(pos), errno, str_error_r(errno, msg, sizeof(msg))); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 661832756a24..1f92445f7480 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -833,7 +833,7 @@ try_again_reset: return -1; } - if (evlist__apply_filters(evsel_list, &counter)) { + if (evlist__apply_filters(evsel_list, &counter, &target)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", counter->filter, evsel__name(counter), errno, str_error_r(errno, msg, sizeof(msg))); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index e8cbbf10d361..d1a06a88d693 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1055,7 +1055,7 @@ try_again: } } - if (evlist__apply_filters(evlist, &counter)) { + if (evlist__apply_filters(evlist, &counter, &opts->target)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", counter->filter ?: "BPF", evsel__name(counter), errno, str_error_r(errno, msg, sizeof(msg))); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 488c2cedc110..ef951ce1a0dd 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4135,7 +4135,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) err = trace__expand_filters(trace, &evsel); if (err) goto out_delete_evlist; - err = evlist__apply_filters(evlist, &evsel); + err = evlist__apply_filters(evlist, &evsel, &trace->opts.target); if (err < 0) goto out_error_apply_filters; diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 2510832d83f9..0b2eca56aa10 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -91,7 +91,7 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr * return -1; } -int perf_bpf_filter__prepare(struct evsel *evsel) +int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_unused) { int i, x, y, fd, ret; struct sample_filter_bpf *skel; diff --git a/tools/perf/util/bpf-filter.h b/tools/perf/util/bpf-filter.h index cd6764442c16..605a3d0226e0 100644 --- a/tools/perf/util/bpf-filter.h +++ b/tools/perf/util/bpf-filter.h @@ -16,6 +16,7 @@ struct perf_bpf_filter_expr { }; struct evsel; +struct target; #ifdef HAVE_BPF_SKEL struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term, @@ -23,7 +24,7 @@ struct 
perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term enum perf_bpf_filter_op op, unsigned long val); int perf_bpf_filter__parse(struct list_head *expr_head, const char *str); -int perf_bpf_filter__prepare(struct evsel *evsel); +int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target); int perf_bpf_filter__destroy(struct evsel *evsel); u64 perf_bpf_filter__lost_count(struct evsel *evsel); @@ -34,7 +35,8 @@ static inline int perf_bpf_filter__parse(struct list_head *expr_head __maybe_unu { return -EOPNOTSUPP; } -static inline int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused) +static inline int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused, + struct target *target __maybe_unused) { return -EOPNOTSUPP; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 3a719edafc7a..1417f9a23083 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1086,7 +1086,8 @@ out_delete_threads: return -1; } -int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) +int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel, + struct target *target) { struct evsel *evsel; int err = 0; @@ -1108,7 +1109,7 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) * non-tracepoint events can have BPF filters. */ if (!list_empty(&evsel->bpf_filters)) { - err = perf_bpf_filter__prepare(evsel); + err = perf_bpf_filter__prepare(evsel, target); if (err) { *err_evsel = evsel; break; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index cb91dc9117a2..cccc34da5a02 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -20,6 +20,7 @@ struct pollfd; struct thread_map; struct perf_cpu_map; struct record_opts; +struct target; /* * State machine of bkw_mmap_state: @@ -212,7 +213,8 @@ void evlist__enable_non_dummy(struct evlist *evlist); void evlist__set_selected(struct evlist *evlist, struct evsel *evsel); int evlist__create_maps(struct evlist *evlist, struct target *target); -int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel); +int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel, + struct target *target); u64 __evlist__combined_sample_type(struct evlist *evlist); u64 evlist__combined_sample_type(struct evlist *evlist); -- cgit v1.2.3 From eb1693b1150d4b99af5241242d6bcd6290cf68a0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:30 -0700 Subject: perf bpf-filter: Split per-task filter use case If the target is a list of tasks, a shared hash map can be used for the filter expressions. The key of the filter map is an integer index like in an array. A separate pid_hash map is added to get the index for the filter map using the tgid. System-wide mode, including per-cpu and per-user targets, is handled by the single-entry map as before.
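At sample time the BPF program then resolves its filter in two steps, tgid -> index -> expression array (a trimmed sketch of the sample_filter.bpf.c hunk further below):

    int k = 0;

    if (use_pid_hash) {
            int tgid = bpf_get_current_pid_tgid() >> 32;
            int *idx = bpf_map_lookup_elem(&pid_hash, &tgid);

            if (idx == NULL)
                    goto drop;  /* no filter registered for this tgid */
            k = *idx;
    }
    entry = bpf_map_lookup_elem(&filters, &k);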
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 186 +++++++++++++++++++++------ tools/perf/util/bpf_skel/sample-filter.h | 1 + tools/perf/util/bpf_skel/sample_filter.bpf.c | 21 +++ 3 files changed, 168 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 0b2eca56aa10..5ec0e0955ec4 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -3,10 +3,13 @@ #include #include +#include #include +#include #include "util/debug.h" #include "util/evsel.h" +#include "util/target.h" #include "util/bpf-filter.h" #include @@ -91,38 +94,17 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr * return -1; } -int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_unused) +static int get_filter_entries(struct evsel *evsel, struct perf_bpf_filter_entry *entry) { - int i, x, y, fd, ret; - struct sample_filter_bpf *skel; - struct bpf_program *prog; - struct bpf_link *link; + int i = 0; struct perf_bpf_filter_expr *expr; - struct perf_bpf_filter_entry *entry; - - entry = calloc(MAX_FILTERS, sizeof(*entry)); - if (entry == NULL) - return -1; - - skel = sample_filter_bpf__open_and_load(); - if (!skel) { - pr_err("Failed to load perf sample-filter BPF skeleton\n"); - ret = -EPERM; - goto err; - } - i = 0; - fd = bpf_map__fd(skel->maps.filters); list_for_each_entry(expr, &evsel->bpf_filters, list) { - if (check_sample_flags(evsel, expr) < 0) { - ret = -EINVAL; - goto err; - } + if (check_sample_flags(evsel, expr) < 0) + return -EINVAL; - if (i == MAX_FILTERS) { - ret = -E2BIG; - goto err; - } + if (i == MAX_FILTERS) + return -E2BIG; entry[i].op = expr->op; entry[i].part = expr->part; @@ -134,10 +116,8 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_ struct perf_bpf_filter_expr *group; list_for_each_entry(group, &expr->groups, list) { - if (i == MAX_FILTERS) { - ret = -E2BIG; - goto err; - } + if (i == MAX_FILTERS) + return -E2BIG; entry[i].op = group->op; entry[i].part = group->part; @@ -146,10 +126,8 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_ i++; } - if (i == MAX_FILTERS) { - ret = -E2BIG; - goto err; - } + if (i == MAX_FILTERS) + return -E2BIG; entry[i].op = PBF_OP_GROUP_END; i++; @@ -161,15 +139,143 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_ entry[i].op = PBF_OP_DONE; i++; } + return 0; +} + +static int convert_to_tgid(int tid) +{ + char path[128]; + char *buf, *p, *q; + int tgid; + size_t len; + + scnprintf(path, sizeof(path), "%d/status", tid); + if (procfs__read_str(path, &buf, &len) < 0) + return -1; - /* The filters map has only one entry for now */ - i = 0; - if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { - ret = -errno; - pr_err("Failed to update the filter map\n"); + p = strstr(buf, "Tgid:"); + if (p == NULL) { + free(buf); + return -1; + } + + tgid = strtol(p + 6, &q, 0); + free(buf); + if (*q != '\n') + return -1; + + return tgid; +} + +static int update_pid_hash(struct sample_filter_bpf *skel, struct evsel *evsel, + struct perf_bpf_filter_entry *entry) +{ + int filter_idx; + int nr, last; + int fd = bpf_map__fd(skel->maps.filters); + 
struct perf_thread_map *threads; + + /* Find the first available entry in the filters map */ + for (filter_idx = 0; filter_idx < MAX_FILTERS; filter_idx++) { + if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) + break; + } + + if (filter_idx == MAX_FILTERS) { + pr_err("Too many users for the filter map\n"); + return -EBUSY; + } + + threads = perf_evsel__threads(&evsel->core); + if (threads == NULL) { + pr_err("Cannot get the thread list of the event\n"); + return -EINVAL; + } + + /* save the index to a hash map */ + fd = bpf_map__fd(skel->maps.pid_hash); + + last = -1; + nr = perf_thread_map__nr(threads); + for (int i = 0; i < nr; i++) { + int pid = perf_thread_map__pid(threads, i); + int tgid; + + /* it actually needs tgid, let's get tgid from /proc. */ + tgid = convert_to_tgid(pid); + if (tgid < 0) { + /* the thread may be dead, ignore. */ + continue; + } + + if (tgid == last) + continue; + last = tgid; + + if (bpf_map_update_elem(fd, &tgid, &filter_idx, BPF_ANY) < 0) { + pr_err("Failed to update the pid hash\n"); + return -errno; + } + pr_debug("pid hash: %d -> %d\n", tgid, filter_idx); + } + return 0; +} + +int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) +{ + int i, x, y, fd, ret; + struct sample_filter_bpf *skel = NULL; + struct bpf_program *prog; + struct bpf_link *link; + struct perf_bpf_filter_entry *entry; + bool needs_pid_hash = !target__has_cpu(target) && !target->uid_str; + + entry = calloc(MAX_FILTERS, sizeof(*entry)); + if (entry == NULL) + return -1; + + ret = get_filter_entries(evsel, entry); + if (ret < 0) { + pr_err("Failed to process filter entries\n"); + goto err; + } + + skel = sample_filter_bpf__open(); + if (!skel) { + pr_err("Failed to open perf sample-filter BPF skeleton\n"); + ret = -EPERM; goto err; } + if (needs_pid_hash) { + bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); + bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); + skel->rodata->use_pid_hash = 1; + } + + if (sample_filter_bpf__load(skel) < 0) { + pr_err("Failed to load perf sample-filter BPF skeleton\n"); + ret = -EPERM; + goto err; + } + + if (needs_pid_hash) { + /* The filters map is shared among other processes */ + ret = update_pid_hash(skel, evsel, entry); + if (ret < 0) + goto err; + } else { + i = 0; + fd = bpf_map__fd(skel->maps.filters); + + /* The filters map has only one entry in this case */ + if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { + ret = -errno; + pr_err("Failed to update the filter map\n"); + goto err; + } + } + prog = skel->progs.perf_sample_filter; for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) { for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) { diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h index bb6a1b91f1df..e666bfd5fbdd 100644 --- a/tools/perf/util/bpf_skel/sample-filter.h +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -2,6 +2,7 @@ #define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H #define MAX_FILTERS 64 +#define MAX_PIDS (16 * 1024) /* supported filter operations */ enum perf_bpf_filter_op { diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index 0d56e52b922c..c5273f06fa45 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -15,7 +15,16 @@ struct filters { __uint(max_entries, 1); } filters SEC(".maps"); +/* tgid to filter index */ +struct pid_hash { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, int); + 
__uint(max_entries, 1); +} pid_hash SEC(".maps"); + int dropped; +volatile const int use_pid_hash; void *bpf_cast_to_kern_ctx(void *) __ksym; @@ -184,6 +193,18 @@ int perf_sample_filter(void *ctx) kctx = bpf_cast_to_kern_ctx(ctx); k = 0; + + if (use_pid_hash) { + int tgid = bpf_get_current_pid_tgid() >> 32; + int *idx; + + idx = bpf_map_lookup_elem(&pid_hash, &tgid); + if (idx) + k = *idx; + else + goto drop; + } + entry = bpf_map_lookup_elem(&filters, &k); if (entry == NULL) goto drop; -- cgit v1.2.3 From 0715f65e94377016e8e12dae6bb6a6d24644c54d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:31 -0700 Subject: perf bpf-filter: Support pin/unpin BPF object And use the pinned objects for unprivileged users to profile their own tasks. The BPF objects need to be pinned in the BPF-fs by root first and it'll be handled in the later patch. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 230 ++++++++++++++++++++++++++++++++++++------- tools/perf/util/bpf-filter.h | 13 +++ 2 files changed, 209 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 5ec0e0955ec4..37ed6c48debf 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include +#include +#include #include #include @@ -23,6 +26,9 @@ #define __PERF_SAMPLE_TYPE(tt, st, opt) { tt, #st, opt } #define PERF_SAMPLE_TYPE(_st, opt) __PERF_SAMPLE_TYPE(PBF_TERM_##_st, PERF_SAMPLE_##_st, opt) +/* Index in the pinned 'filters' map. Should be released after use. 
*/ +static int pinned_filter_idx = -1; + static const struct perf_sample_info { enum perf_bpf_filter_term type; const char *name; @@ -47,6 +53,8 @@ static const struct perf_sample_info { PERF_SAMPLE_TYPE(DATA_PAGE_SIZE, "--data-page-size"), }; +static int get_pinned_fd(const char *name); + static const struct perf_sample_info *get_sample_info(enum perf_bpf_filter_term type) { size_t i; @@ -167,19 +175,26 @@ static int convert_to_tgid(int tid) return tgid; } -static int update_pid_hash(struct sample_filter_bpf *skel, struct evsel *evsel, - struct perf_bpf_filter_entry *entry) +static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry) { int filter_idx; - int nr, last; - int fd = bpf_map__fd(skel->maps.filters); + int fd, nr, last; struct perf_thread_map *threads; + fd = get_pinned_fd("filters"); + if (fd < 0) { + pr_debug("cannot get fd for 'filters' map\n"); + return fd; + } + /* Find the first available entry in the filters map */ for (filter_idx = 0; filter_idx < MAX_FILTERS; filter_idx++) { - if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) + if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) { + pinned_filter_idx = filter_idx; break; + } } + close(fd); if (filter_idx == MAX_FILTERS) { pr_err("Too many users for the filter map\n"); @@ -193,7 +208,9 @@ static int update_pid_hash(struct sample_filter_bpf *skel, struct evsel *evsel, } /* save the index to a hash map */ - fd = bpf_map__fd(skel->maps.pid_hash); + fd = get_pinned_fd("pid_hash"); + if (fd < 0) + return fd; last = -1; nr = perf_thread_map__nr(threads); @@ -214,10 +231,12 @@ static int update_pid_hash(struct sample_filter_bpf *skel, struct evsel *evsel, if (bpf_map_update_elem(fd, &tgid, &filter_idx, BPF_ANY) < 0) { pr_err("Failed to update the pid hash\n"); - return -errno; + close(fd); + return -1; } pr_debug("pid hash: %d -> %d\n", tgid, filter_idx); } + close(fd); return 0; } @@ -240,40 +259,48 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) goto err; } - skel = sample_filter_bpf__open(); - if (!skel) { - pr_err("Failed to open perf sample-filter BPF skeleton\n"); - ret = -EPERM; - goto err; - } + if (needs_pid_hash && geteuid() != 0) { + /* The filters map is shared among other processes */ + ret = update_pid_hash(evsel, entry); + if (ret < 0) + goto err; - if (needs_pid_hash) { - bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); - bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); - skel->rodata->use_pid_hash = 1; + fd = get_pinned_fd("perf_sample_filter"); + if (fd < 0) { + ret = fd; + goto err; + } + + for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) { + for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) { + ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_SET_BPF, fd); + if (ret < 0) { + pr_err("Failed to attach perf sample-filter\n"); + goto err; + } + } + } + + close(fd); + free(entry); + return 0; } - if (sample_filter_bpf__load(skel) < 0) { + skel = sample_filter_bpf__open_and_load(); + if (!skel) { + ret = -errno; pr_err("Failed to load perf sample-filter BPF skeleton\n"); - ret = -EPERM; goto err; } - if (needs_pid_hash) { - /* The filters map is shared among other processes */ - ret = update_pid_hash(skel, evsel, entry); - if (ret < 0) - goto err; - } else { - i = 0; - fd = bpf_map__fd(skel->maps.filters); - - /* The filters map has only one entry in this case */ - if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { - ret = -errno; - pr_err("Failed to update the filter map\n"); - goto err; - } + i = 0; + fd 
= bpf_map__fd(skel->maps.filters); + + /* The filters map has only one entry in this case */ + if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { + ret = -errno; + pr_err("Failed to update the filter map\n"); + goto err; } prog = skel->progs.perf_sample_filter; @@ -306,6 +333,15 @@ int perf_bpf_filter__destroy(struct evsel *evsel) free(expr); } sample_filter_bpf__destroy(evsel->bpf_skel); + + if (pinned_filter_idx >= 0) { + int fd = get_pinned_fd("filters"); + + bpf_map_delete_elem(fd, &pinned_filter_idx); + pinned_filter_idx = -1; + close(fd); + } + return 0; } @@ -349,3 +385,129 @@ int perf_bpf_filter__parse(struct list_head *expr_head, const char *str) return ret; } + +int perf_bpf_filter__pin(void) +{ + struct sample_filter_bpf *skel; + char *path = NULL; + int dir_fd, ret = -1; + + skel = sample_filter_bpf__open(); + if (!skel) { + ret = -errno; + pr_err("Failed to open perf sample-filter BPF skeleton\n"); + goto err; + } + + /* pinned program will use pid-hash */ + bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); + bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); + skel->rodata->use_pid_hash = 1; + + if (sample_filter_bpf__load(skel) < 0) { + ret = -errno; + pr_err("Failed to load perf sample-filter BPF skeleton\n"); + goto err; + } + + if (asprintf(&path, "%s/fs/bpf/%s", sysfs__mountpoint(), + PERF_BPF_FILTER_PIN_PATH) < 0) { + ret = -errno; + pr_err("Failed to allocate pathname in the BPF-fs\n"); + goto err; + } + + ret = bpf_object__pin(skel->obj, path); + if (ret < 0) { + pr_err("Failed to pin BPF filter objects\n"); + goto err; + } + + /* setup access permissions for the pinned objects */ + dir_fd = open(path, O_PATH); + if (dir_fd < 0) { + bpf_object__unpin(skel->obj, path); + ret = dir_fd; + goto err; + } + + /* BPF-fs root has the sticky bit */ + if (fchmodat(dir_fd, "..", 01755, 0) < 0) { + pr_debug("chmod for BPF-fs failed\n"); + ret = -errno; + goto err_close; + } + + /* perf_filter directory */ + if (fchmodat(dir_fd, ".", 0755, 0) < 0) { + pr_debug("chmod for perf_filter directory failed?\n"); + ret = -errno; + goto err_close; + } + + /* programs need write permission for some reason */ + if (fchmodat(dir_fd, "perf_sample_filter", 0777, 0) < 0) { + pr_debug("chmod for perf_sample_filter failed\n"); + ret = -errno; + } + /* maps */ + if (fchmodat(dir_fd, "filters", 0666, 0) < 0) { + pr_debug("chmod for filters failed\n"); + ret = -errno; + } + if (fchmodat(dir_fd, "pid_hash", 0666, 0) < 0) { + pr_debug("chmod for pid_hash failed\n"); + ret = -errno; + } + +err_close: + close(dir_fd); + +err: + free(path); + sample_filter_bpf__destroy(skel); + return ret; +} + +int perf_bpf_filter__unpin(void) +{ + struct sample_filter_bpf *skel; + char *path = NULL; + int ret = -1; + + skel = sample_filter_bpf__open_and_load(); + if (!skel) { + ret = -errno; + pr_err("Failed to open perf sample-filter BPF skeleton\n"); + goto err; + } + + if (asprintf(&path, "%s/fs/bpf/%s", sysfs__mountpoint(), + PERF_BPF_FILTER_PIN_PATH) < 0) { + ret = -errno; + pr_err("Failed to allocate pathname in the BPF-fs\n"); + goto err; + } + + ret = bpf_object__unpin(skel->obj, path); + +err: + free(path); + sample_filter_bpf__destroy(skel); + return ret; +} + +static int get_pinned_fd(const char *name) +{ + char *path = NULL; + int fd; + + if (asprintf(&path, "%s/fs/bpf/%s/%s", sysfs__mountpoint(), + PERF_BPF_FILTER_PIN_PATH, name) < 0) + return -1; + + fd = bpf_obj_get(path); + + free(path); + return fd; +} diff --git a/tools/perf/util/bpf-filter.h b/tools/perf/util/bpf-filter.h index 
605a3d0226e0..916ed7770b73 100644 --- a/tools/perf/util/bpf-filter.h +++ b/tools/perf/util/bpf-filter.h @@ -18,6 +18,9 @@ struct perf_bpf_filter_expr { struct evsel; struct target; +/* path in BPF-fs for the pinned program and maps */ +#define PERF_BPF_FILTER_PIN_PATH "perf_filter" + #ifdef HAVE_BPF_SKEL struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term, int part, @@ -27,6 +30,8 @@ int perf_bpf_filter__parse(struct list_head *expr_head, const char *str); int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target); int perf_bpf_filter__destroy(struct evsel *evsel); u64 perf_bpf_filter__lost_count(struct evsel *evsel); +int perf_bpf_filter__pin(void); +int perf_bpf_filter__unpin(void); #else /* !HAVE_BPF_SKEL */ @@ -48,5 +53,13 @@ static inline u64 perf_bpf_filter__lost_count(struct evsel *evsel __maybe_unused { return 0; } +static inline int perf_bpf_filter__pin(void) +{ + return -EOPNOTSUPP; +} +static inline int perf_bpf_filter__unpin(void) +{ + return -EOPNOTSUPP; +} #endif /* HAVE_BPF_SKEL*/ #endif /* PERF_UTIL_BPF_FILTER_H */ -- cgit v1.2.3 From 1ec6fd34e0572588b9628e397ff134fd3cf79b5d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:32 -0700 Subject: perf bpf-filter: Support separate lost counts for each filter As the BPF filter is shared between other processes, it should have its own counter for each invocation. Add a new array map (lost_count) to save the count using the same index as the filter. It should clear the count before running the filter. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 37 ++++++++++++++++++++++++++-- tools/perf/util/bpf_skel/sample_filter.bpf.c | 15 +++++++++-- 2 files changed, 48 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 37ed6c48debf..c5eb0b7eec19 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -260,11 +260,23 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) } if (needs_pid_hash && geteuid() != 0) { + int zero = 0; + /* The filters map is shared among other processes */ ret = update_pid_hash(evsel, entry); if (ret < 0) goto err; + fd = get_pinned_fd("dropped"); + if (fd < 0) { + ret = fd; + goto err; + } + + /* Reset the lost count */ + bpf_map_update_elem(fd, &pinned_filter_idx, &zero, BPF_ANY); + close(fd); + fd = get_pinned_fd("perf_sample_filter"); if (fd < 0) { ret = fd; @@ -347,9 +359,25 @@ int perf_bpf_filter__destroy(struct evsel *evsel) u64 perf_bpf_filter__lost_count(struct evsel *evsel) { - struct sample_filter_bpf *skel = evsel->bpf_skel; + int count = 0; + + if (list_empty(&evsel->bpf_filters)) + return 0; + + if (pinned_filter_idx >= 0) { + int fd = get_pinned_fd("dropped"); + + bpf_map_lookup_elem(fd, &pinned_filter_idx, &count); + close(fd); + } else if (evsel->bpf_skel) { + struct sample_filter_bpf *skel = evsel->bpf_skel; + int fd = bpf_map__fd(skel->maps.dropped); + int idx = 0; - return skel ? 
skel->bss->dropped : 0; + bpf_map_lookup_elem(fd, &idx, &count); + } + + return count; } struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term, @@ -402,6 +430,7 @@ int perf_bpf_filter__pin(void) /* pinned program will use pid-hash */ bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); + bpf_map__set_max_entries(skel->maps.dropped, MAX_FILTERS); skel->rodata->use_pid_hash = 1; if (sample_filter_bpf__load(skel) < 0) { @@ -459,6 +488,10 @@ int perf_bpf_filter__pin(void) pr_debug("chmod for pid_hash failed\n"); ret = -errno; } + if (fchmodat(dir_fd, "dropped", 0666, 0) < 0) { + pr_debug("chmod for dropped failed\n"); + ret = -errno; + } err_close: close(dir_fd); diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index c5273f06fa45..4c75354b84fd 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -23,7 +23,14 @@ struct pid_hash { __uint(max_entries, 1); } pid_hash SEC(".maps"); -int dropped; +/* lost sample count for each filter */ +struct lost_count { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); +} dropped SEC(".maps"); + volatile const int use_pid_hash; void *bpf_cast_to_kern_ctx(void *) __ksym; @@ -189,6 +196,7 @@ int perf_sample_filter(void *ctx) int in_group = 0; int group_result = 0; int i, k; + int *losts; kctx = bpf_cast_to_kern_ctx(ctx); @@ -252,7 +260,10 @@ int perf_sample_filter(void *ctx) return 1; drop: - __sync_fetch_and_add(&dropped, 1); + losts = bpf_map_lookup_elem(&dropped, &k); + if (losts != NULL) + __sync_fetch_and_add(losts, 1); + return 0; } -- cgit v1.2.3 From 73bf63a4750ea18d7fbb8f80695dcfd0656d13f4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:33 -0700 Subject: perf record: Fix a potential error handling issue The evlist is allocated at the beginning of cmd_record(). Also, freeing the thread masks should be paired with record__init_thread_masks(), which is called right before __cmd_record(). Let's change the order of these functions to release the resources correctly in case of errors. This may be fine in practice since the process is exiting, but it could be a problem if the process manages system-wide resources that live longer than the process itself.
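In other words, the exit path should mirror the setup order in reverse (a sketch of the reordered cleanup, matching the hunk below):

    out:
            record__free_thread_masks(rec, rec->nr_threads); /* pairs with record__init_thread_masks() */
            rec->nr_threads = 0;
            symbol__exit();
            auxtrace_record__free(rec->itr);
    out_opts:
            evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
            evlist__delete(rec->evlist); /* allocated first in cmd_record(), freed last */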
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index c599e620ee89..01fb95487a63 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -4242,13 +4242,13 @@ int cmd_record(int argc, const char **argv) err = __cmd_record(&record, argc, argv); out: - evlist__delete(rec->evlist); + record__free_thread_masks(rec, rec->nr_threads); + rec->nr_threads = 0; symbol__exit(); auxtrace_record__free(rec->itr); out_opts: - record__free_thread_masks(rec, rec->nr_threads); - rec->nr_threads = 0; evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close); + evlist__delete(rec->evlist); return err; } -- cgit v1.2.3 From 3dee4b83a6b4c167db0827c2b1e288460af72ac6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:34 -0700 Subject: perf record: Add --setup-filter option To allow BPF filters for unprivileged users it needs to pin the BPF objects to BPF-fs first. Let's add a new option to pin and unpin the objects easily. I'm not sure 'perf record' is a right place to do this but I don't have a better idea right now. $ sudo perf record --setup-filter pin The above command would pin BPF program and maps for the filter when the system has BPF-fs (usually at /sys/fs/bpf/). To unpin the objects, users can run the following command (as root). $ sudo perf record --setup-filter unpin Committer testing: root@number:~# perf record --setup-filter pin root@number:~# ls -la /sys/fs/bpf/perf_filter/ total 0 drwxr-xr-x. 2 root root 0 Jul 31 10:43 . drwxr-xr-t. 3 root root 0 Jul 31 10:43 .. -rw-rw-rw-. 1 root root 0 Jul 31 10:43 dropped -rw-rw-rw-. 1 root root 0 Jul 31 10:43 filters -rwxrwxrwx. 1 root root 0 Jul 31 10:43 perf_sample_filter -rw-rw-rw-. 1 root root 0 Jul 31 10:43 pid_hash -rw-------. 1 root root 0 Jul 31 10:43 sample_f_rodata root@number:~# ls -la /sys/fs/bpf/perf_filter/perf_sample_filter -rwxrwxrwx. 1 root root 0 Jul 31 10:43 /sys/fs/bpf/perf_filter/perf_sample_filter root@number:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 5 +++++ tools/perf/builtin-record.c | 15 +++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index d6532ed97c02..41e36b4dc765 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -828,6 +828,11 @@ filtered through the mask provided by -C option. only, as of now. So the applications built without the frame pointer might see bogus addresses. +--setup-filter=:: + Prepare BPF filter to be used by regular users. The action should be + either "pin" or "unpin". The filter can be used after it's pinned. 
+ + include::intel-hybrid.txt[] SEE ALSO diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 01fb95487a63..72345d1e54b0 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -171,6 +171,7 @@ struct record { bool timestamp_filename; bool timestamp_boundary; bool off_cpu; + const char *filter_action; struct switch_output switch_output; unsigned long long samples; unsigned long output_max_size; /* = 0: unlimited */ @@ -3557,6 +3558,8 @@ static struct option __record_options[] = { "write collected trace data into several data files using parallel threads", record__parse_threads), OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"), + OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin", + "BPF filter action"), OPT_END() }; @@ -4086,6 +4089,18 @@ int cmd_record(int argc, const char **argv) pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n"); } + if (rec->filter_action) { + if (!strcmp(rec->filter_action, "pin")) + err = perf_bpf_filter__pin(); + else if (!strcmp(rec->filter_action, "unpin")) + err = perf_bpf_filter__unpin(); + else { + pr_warning("Unknown BPF filter action: %s\n", rec->filter_action); + err = -EINVAL; + } + goto out_opts; + } + /* * Allow aliases to facilitate the lookup of symbols for address * filters. Refer to auxtrace_parse_filters(). -- cgit v1.2.3 From 9cb3549b73c1cd1f8ed0ff32bafab7649e5bf4ea Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:35 -0700 Subject: perf test: Update sample filtering test Now the BPF filtering test can run as a normal user if the BPF objects are pinned by 'sudo perf record --setup-filter pin'. Let's update the test case to verify the behavior. It'll skip the test if the filter check fails for a normal user, but it shows a message explaining how to set up the filters. First, run the test as a normal user and it fails. $ perf test -vv filtering 95: perf record sample filtering (by BPF) tests: --- start --- test child forked, pid 425677 Checking BPF-filter privilege try 'sudo perf record --setup-filter pin' first. <<<--- here bpf-filter test [Skipped permission] ---- end(-2) ---- 95: perf record sample filtering (by BPF) tests : Skip According to the message, run the perf record command to pin the BPF objects. $ sudo perf record --setup-filter pin And re-run the test as a normal user.
$ perf test -vv filtering 95: perf record sample filtering (by BPF) tests: --- start --- test child forked, pid 424486 Checking BPF-filter privilege Basic bpf-filter test Basic bpf-filter test [Success] Failing bpf-filter test Error: task-clock event does not have PERF_SAMPLE_CPU Failing bpf-filter test [Success] Group bpf-filter test Error: task-clock event does not have PERF_SAMPLE_CPU Error: task-clock event does not have PERF_SAMPLE_CODE_PAGE_SIZE Group bpf-filter test [Success] ---- end(0) ---- 95: perf record sample filtering (by BPF) tests : Ok Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_bpf_filter.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record_bpf_filter.sh b/tools/perf/tests/shell/record_bpf_filter.sh index 31c593966e8c..c5882d620db7 100755 --- a/tools/perf/tests/shell/record_bpf_filter.sh +++ b/tools/perf/tests/shell/record_bpf_filter.sh @@ -22,15 +22,16 @@ trap trap_cleanup EXIT TERM INT test_bpf_filter_priv() { echo "Checking BPF-filter privilege" - if [ "$(id -u)" != 0 ] - then - echo "bpf-filter test [Skipped permission]" - err=2 - return - fi if ! perf record -e task-clock --filter 'period > 1' \ -o /dev/null --quiet true 2>&1 then + if [ "$(id -u)" != 0 ] + then + echo "try 'sudo perf record --setup-filter pin' first." + echo "bpf-filter test [Skipped permission]" + err=2 + return + fi echo "bpf-filter test [Skipped missing BPF support]" err=2 return -- cgit v1.2.3 From 4e51e13bd986cab31dfab5c3bff1678171debbd1 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jul 2024 16:08:03 +0500 Subject: selftests: user: remove user suite The user test suite has only one test, test_user_copy which loads test_user_copy module for testing. But test_user_copy module has already been converted to kunit (see fixes). Hence remove the entire suite. 
Fixes: cf6219ee889f ("usercopy: Convert test_user_copy to KUnit test") Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 1 - tools/testing/selftests/user/Makefile | 9 --------- tools/testing/selftests/user/config | 1 - tools/testing/selftests/user/test_user_copy.sh | 18 ------------------ 4 files changed, 29 deletions(-) delete mode 100644 tools/testing/selftests/user/Makefile delete mode 100644 tools/testing/selftests/user/config delete mode 100755 tools/testing/selftests/user/test_user_copy.sh (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index bc8fe9e8f7f2..af2429431b6b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -107,7 +107,6 @@ TARGETS += tmpfs TARGETS += tpm2 TARGETS += tty TARGETS += uevent -TARGETS += user TARGETS += user_events TARGETS += vDSO TARGETS += mm diff --git a/tools/testing/selftests/user/Makefile b/tools/testing/selftests/user/Makefile deleted file mode 100644 index 640a40f9b72b..000000000000 --- a/tools/testing/selftests/user/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# Makefile for user memory selftests - -# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" -all: - -TEST_PROGS := test_user_copy.sh - -include ../lib.mk diff --git a/tools/testing/selftests/user/config b/tools/testing/selftests/user/config deleted file mode 100644 index 784ed8416324..000000000000 --- a/tools/testing/selftests/user/config +++ /dev/null @@ -1 +0,0 @@ -CONFIG_TEST_USER_COPY=m diff --git a/tools/testing/selftests/user/test_user_copy.sh b/tools/testing/selftests/user/test_user_copy.sh deleted file mode 100755 index f9b31a57439b..000000000000 --- a/tools/testing/selftests/user/test_user_copy.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Runs copy_to/from_user infrastructure using test_user_copy kernel module - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -if ! /sbin/modprobe -q -n test_user_copy; then - echo "user: module test_user_copy is not found [SKIP]" - exit $ksft_skip -fi -if /sbin/modprobe -q test_user_copy; then - /sbin/modprobe -q -r test_user_copy - echo "user_copy: ok" -else - echo "user_copy: [FAIL]" - exit 1 -fi -- cgit v1.2.3 From 44b045e27c65153cc8f549627d8d6d82f897c03c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jul 2024 17:11:39 +0500 Subject: selftests: lib: remove strscpy test The strscpy test loads test_strscpy module for testing. But test_strscpy was converted to Kunit (see fixes). Hence remove strscpy. 
Fixes: 41eefc46a3a4 ("string: Convert strscpy() self-test to KUnit") Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/lib/Makefile | 3 +-- tools/testing/selftests/lib/config | 1 - tools/testing/selftests/lib/strscpy.sh | 3 --- 3 files changed, 1 insertion(+), 6 deletions(-) delete mode 100755 tools/testing/selftests/lib/strscpy.sh (limited to 'tools') diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile index ee71fc99d5b5..c52fe3ad8e98 100644 --- a/tools/testing/selftests/lib/Makefile +++ b/tools/testing/selftests/lib/Makefile @@ -4,6 +4,5 @@ # No binaries, but make sure arg-less "make" doesn't trigger "run_tests" all: -TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh strscpy.sh - +TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh include ../lib.mk diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 645839b50b0a..dc15aba8d0a3 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,5 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m -CONFIG_TEST_STRSCPY=m CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/lib/strscpy.sh b/tools/testing/selftests/lib/strscpy.sh deleted file mode 100755 index be60ef6e1a7f..000000000000 --- a/tools/testing/selftests/lib/strscpy.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0+ -$(dirname $0)/../kselftest/module.sh "strscpy*" test_strscpy -- cgit v1.2.3 From f0a1ffa6f9771c6a1283afab591781e7c797e2e1 Mon Sep 17 00:00:00 2001 From: Abdulrasaq Lawani Date: Thu, 1 Aug 2024 05:29:52 -0400 Subject: selftest: acct: Add selftest for the acct() syscall The acct() system call enables or disables process accounting. If accounting is turned on, records for each terminating process are appended to a specified filename as it terminates. An argument of NULL causes accounting to be turned off. This patch will add a test for the acct() syscall. 
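For reference, the flow being exercised boils down to this (a minimal sketch, assuming root and CONFIG_BSD_PROCESS_ACCT; the test below adds the kselftest plumbing around it):

    #include <unistd.h>

    int main(void)
    {
            if (acct("process_log") < 0)    /* start appending accounting records */
                    return 1;
            /* fork() and reap a child so a record gets written */
            acct(NULL);                     /* NULL turns accounting off again */
            return 0;
    }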
Signed-off-by: Abdulrasaq Lawani Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/acct/.gitignore | 3 ++ tools/testing/selftests/acct/Makefile | 5 ++ tools/testing/selftests/acct/acct_syscall.c | 78 +++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/acct/.gitignore create mode 100644 tools/testing/selftests/acct/Makefile create mode 100644 tools/testing/selftests/acct/acct_syscall.c (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index af2429431b6b..f51fcceb45fd 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +TARGETS += acct TARGETS += alsa TARGETS += amd-pstate TARGETS += arm64 diff --git a/tools/testing/selftests/acct/.gitignore b/tools/testing/selftests/acct/.gitignore new file mode 100644 index 000000000000..7e78aac19038 --- /dev/null +++ b/tools/testing/selftests/acct/.gitignore @@ -0,0 +1,3 @@ +acct_syscall +config +process_log \ No newline at end of file diff --git a/tools/testing/selftests/acct/Makefile b/tools/testing/selftests/acct/Makefile new file mode 100644 index 000000000000..7e025099cf65 --- /dev/null +++ b/tools/testing/selftests/acct/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := acct_syscall +CFLAGS += -Wall + +include ../lib.mk \ No newline at end of file diff --git a/tools/testing/selftests/acct/acct_syscall.c b/tools/testing/selftests/acct/acct_syscall.c new file mode 100644 index 000000000000..e44e8fe1f4a3 --- /dev/null +++ b/tools/testing/selftests/acct/acct_syscall.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* kselftest for acct() system call + * The acct() system call enables or disables process accounting. + */ + +#include +#include +#include +#include + +#include "../kselftest.h" + +int main(void) +{ + char filename[] = "process_log"; + FILE *fp; + pid_t child_pid; + int sz; + + // Setting up kselftest framework + ksft_print_header(); + ksft_set_plan(1); + + // Check if test is run as root + if (geteuid()) { + ksft_test_result_skip("This test needs root to run!\n"); + return 1; + } + + // Create file to log closed processes + fp = fopen(filename, "w"); + + if (!fp) { + ksft_test_result_error("%s.\n", strerror(errno)); + ksft_finished(); + return 1; + } + + acct(filename); + + // Handle error conditions + if (errno) { + ksft_test_result_error("%s.\n", strerror(errno)); + fclose(fp); + ksft_finished(); + return 1; + } + + // Create child process and wait for it to terminate.
+ + child_pid = fork(); + + if (child_pid < 0) { + ksft_test_result_error("Creating a child process to log failed\n"); + acct(NULL); + return 1; + } else if (child_pid > 0) { + wait(NULL); + fseek(fp, 0L, SEEK_END); + sz = ftell(fp); + + acct(NULL); + + if (sz <= 0) { + ksft_test_result_fail("Terminated child process not logged\n"); + ksft_exit_fail(); + return 1; + } + + ksft_test_result_pass("Successfully logged terminated process.\n"); + fclose(fp); + ksft_exit_pass(); + return 0; + } + + return 1; +} -- cgit v1.2.3 From ea59b70a8418a313d6f2ab48a957de015fc33018 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 31 Jul 2024 11:58:56 -0300 Subject: perf bpf: Move BPF disassembly routines to separate file to avoid clash with capstone bpf headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a clash between the libbpf and capstone headers that ends up with: In file included from /usr/include/capstone/capstone.h:325, from util/disasm.c:1513: /usr/include/capstone/bpf.h:94:14: error: ‘bpf_insn’ defined as wrong kind of tag 94 | typedef enum bpf_insn { So far we're just trying to avoid this by not having both headers included in the same .c or .h file; do it one more time by moving the BPF disassembly routines from util/disasm.c to util/disasm_bpf.c. This is only hit when building with BUILD_NONDISTRO=1, i.e. building with binutils-devel, which isn't in the default build due to a licensing clash. We need to reimplement what is now isolated in util/disasm_bpf.c using some other library to have the BPF annotation feature that is now only available with BUILD_NONDISTRO=1. Fixes: 6d17edc113de1e21 ("perf annotate: Use libcapstone to disassemble") Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZqpUSKPxMwaQKORr@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/disasm.c | 187 +---------------------------------- tools/perf/util/disasm_bpf.c | 195 +++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/disasm_bpf.h | 12 +++ 4 files changed, 209 insertions(+), 186 deletions(-) create mode 100644 tools/perf/util/disasm_bpf.c create mode 100644 tools/perf/util/disasm_bpf.h (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 0f18fe81ef0b..b24360c04aae 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -13,6 +13,7 @@ perf-util-y += copyfile.o perf-util-y += ctype.o perf-util-y += db-export.o perf-util-y += disasm.o +perf-util-y += disasm_bpf.o perf-util-y += env.o perf-util-y += event.o perf-util-y += evlist.o diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 410e52cd9cfd..85fb0cfedf94 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -16,6 +16,7 @@ #include "build-id.h" #include "debug.h" #include "disasm.h" +#include "disasm_bpf.h" #include "dso.h" #include "env.h" #include "evsel.h" @@ -1323,192 +1324,6 @@ fallback: return 0; } -#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -#define PACKAGE "perf" -#include -#include -#include -#include -#include -#include -#include - -#include "bpf-event.h" -#include "bpf-utils.h" - -static int symbol__disassemble_bpf(struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct bpf_prog_linfo *prog_linfo = NULL; - struct bpf_prog_info_node *info_node; - int len = sym->end - sym->start; - disassembler_ftype disassemble; -
struct map *map = args->ms.map; - struct perf_bpil *info_linear; - struct disassemble_info info; - struct dso *dso = map__dso(map); - int pc = 0, count, sub_id; - struct btf *btf = NULL; - char tpath[PATH_MAX]; - size_t buf_size; - int nr_skip = 0; - char *buf; - bfd *bfdf; - int ret; - FILE *s; - - if (dso__binary_type(dso) != DSO_BINARY_TYPE__BPF_PROG_INFO) - return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; - - pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, - sym->name, sym->start, sym->end - sym->start); - - memset(tpath, 0, sizeof(tpath)); - perf_exe(tpath, sizeof(tpath)); - - bfdf = bfd_openr(tpath, NULL); - if (bfdf == NULL) - abort(); - - if (!bfd_check_format(bfdf, bfd_object)) - abort(); - - s = open_memstream(&buf, &buf_size); - if (!s) { - ret = errno; - goto out; - } - init_disassemble_info_compat(&info, s, - (fprintf_ftype) fprintf, - fprintf_styled); - info.arch = bfd_get_arch(bfdf); - info.mach = bfd_get_mach(bfdf); - - info_node = perf_env__find_bpf_prog_info(dso__bpf_prog(dso)->env, - dso__bpf_prog(dso)->id); - if (!info_node) { - ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; - goto out; - } - info_linear = info_node->info_linear; - sub_id = dso__bpf_prog(dso)->sub_id; - - info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); - info.buffer_length = info_linear->info.jited_prog_len; - - if (info_linear->info.nr_line_info) - prog_linfo = bpf_prog_linfo__new(&info_linear->info); - - if (info_linear->info.btf_id) { - struct btf_node *node; - - node = perf_env__find_btf(dso__bpf_prog(dso)->env, - info_linear->info.btf_id); - if (node) - btf = btf__new((__u8 *)(node->data), - node->data_size); - } - - disassemble_init_for_target(&info); - -#ifdef DISASM_FOUR_ARGS_SIGNATURE - disassemble = disassembler(info.arch, - bfd_big_endian(bfdf), - info.mach, - bfdf); -#else - disassemble = disassembler(bfdf); -#endif - if (disassemble == NULL) - abort(); - - fflush(s); - do { - const struct bpf_line_info *linfo = NULL; - struct disasm_line *dl; - size_t prev_buf_size; - const char *srcline; - u64 addr; - - addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; - count = disassemble(pc, &info); - - if (prog_linfo) - linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, - addr, sub_id, - nr_skip); - - if (linfo && btf) { - srcline = btf__name_by_offset(btf, linfo->line_off); - nr_skip++; - } else - srcline = NULL; - - fprintf(s, "\n"); - prev_buf_size = buf_size; - fflush(s); - - if (!annotate_opts.hide_src_code && srcline) { - args->offset = -1; - args->line = strdup(srcline); - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) { - annotation_line__add(&dl->al, - &notes->src->source); - } - } - - args->offset = pc; - args->line = buf + prev_buf_size; - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, &notes->src->source); - - pc += count; - } while (count > 0 && pc < len); - - ret = 0; -out: - free(prog_linfo); - btf__free(btf); - fclose(s); - bfd_close(bfdf); - return ret; -} -#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, - struct annotate_args *args __maybe_unused) -{ - return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; -} -#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) - -static int -symbol__disassemble_bpf_image(struct symbol *sym, - struct annotate_args *args) -{ -
struct annotation *notes = symbol__annotation(sym); - struct disasm_line *dl; - - args->offset = -1; - args->line = strdup("to be implemented"); - args->line_nr = 0; - args->fileloc = NULL; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, &notes->src->source); - - zfree(&args->line); - return 0; -} - #ifdef HAVE_LIBCAPSTONE_SUPPORT #include diff --git a/tools/perf/util/disasm_bpf.c b/tools/perf/util/disasm_bpf.c new file mode 100644 index 000000000000..1fee71c79b62 --- /dev/null +++ b/tools/perf/util/disasm_bpf.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "util/annotate.h" +#include "util/disasm_bpf.h" +#include "util/symbol.h" +#include +#include + +#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +#define PACKAGE "perf" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/bpf-event.h" +#include "util/bpf-utils.h" +#include "util/debug.h" +#include "util/dso.h" +#include "util/map.h" +#include "util/env.h" +#include "util/util.h" + +int symbol__disassemble_bpf(struct symbol *sym, struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct bpf_prog_linfo *prog_linfo = NULL; + struct bpf_prog_info_node *info_node; + int len = sym->end - sym->start; + disassembler_ftype disassemble; + struct map *map = args->ms.map; + struct perf_bpil *info_linear; + struct disassemble_info info; + struct dso *dso = map__dso(map); + int pc = 0, count, sub_id; + struct btf *btf = NULL; + char tpath[PATH_MAX]; + size_t buf_size; + int nr_skip = 0; + char *buf; + bfd *bfdf; + int ret; + FILE *s; + + if (dso__binary_type(dso) != DSO_BINARY_TYPE__BPF_PROG_INFO) + return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; + + pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, + sym->name, sym->start, sym->end - sym->start); + + memset(tpath, 0, sizeof(tpath)); + perf_exe(tpath, sizeof(tpath)); + + bfdf = bfd_openr(tpath, NULL); + if (bfdf == NULL) + abort(); + + if (!bfd_check_format(bfdf, bfd_object)) + abort(); + + s = open_memstream(&buf, &buf_size); + if (!s) { + ret = errno; + goto out; + } + init_disassemble_info_compat(&info, s, + (fprintf_ftype) fprintf, + fprintf_styled); + info.arch = bfd_get_arch(bfdf); + info.mach = bfd_get_mach(bfdf); + + info_node = perf_env__find_bpf_prog_info(dso__bpf_prog(dso)->env, + dso__bpf_prog(dso)->id); + if (!info_node) { + ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; + goto out; + } + info_linear = info_node->info_linear; + sub_id = dso__bpf_prog(dso)->sub_id; + + info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); + info.buffer_length = info_linear->info.jited_prog_len; + + if (info_linear->info.nr_line_info) + prog_linfo = bpf_prog_linfo__new(&info_linear->info); + + if (info_linear->info.btf_id) { + struct btf_node *node; + + node = perf_env__find_btf(dso__bpf_prog(dso)->env, + info_linear->info.btf_id); + if (node) + btf = btf__new((__u8 *)(node->data), + node->data_size); + } + + disassemble_init_for_target(&info); + +#ifdef DISASM_FOUR_ARGS_SIGNATURE + disassemble = disassembler(info.arch, + bfd_big_endian(bfdf), + info.mach, + bfdf); +#else + disassemble = disassembler(bfdf); +#endif + if (disassemble == NULL) + abort(); + + fflush(s); + do { + const struct bpf_line_info *linfo = NULL; + struct disasm_line *dl; + size_t prev_buf_size; + const char *srcline; + u64 addr; + + addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; + count = disassemble(pc, &info); + + if
(prog_linfo) + linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, + addr, sub_id, + nr_skip); + + if (linfo && btf) { + srcline = btf__name_by_offset(btf, linfo->line_off); + nr_skip++; + } else + srcline = NULL; + + fprintf(s, "\n"); + prev_buf_size = buf_size; + fflush(s); + + if (!annotate_opts.hide_src_code && srcline) { + args->offset = -1; + args->line = strdup(srcline); + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) { + annotation_line__add(&dl->al, + &notes->src->source); + } + } + + args->offset = pc; + args->line = buf + prev_buf_size; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, &notes->src->source); + + pc += count; + } while (count > 0 && pc < len); + + ret = 0; +out: + free(prog_linfo); + btf__free(btf); + fclose(s); + bfd_close(bfdf); + return ret; +} +#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, struct annotate_args *args __maybe_unused) +{ + return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; +} +#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) + +int symbol__disassemble_bpf_image(struct symbol *sym, struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct disasm_line *dl; + + args->offset = -1; + args->line = strdup("to be implemented"); + args->line_nr = 0; + args->fileloc = NULL; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, &notes->src->source); + + zfree(&args->line); + return 0; +} diff --git a/tools/perf/util/disasm_bpf.h b/tools/perf/util/disasm_bpf.h new file mode 100644 index 000000000000..2ecb19545388 --- /dev/null +++ b/tools/perf/util/disasm_bpf.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef __PERF_DISASM_BPF_H +#define __PERF_DISASM_BPF_H + +struct symbol; +struct annotate_args; + +int symbol__disassemble_bpf(struct symbol *sym, struct annotate_args *args); +int symbol__disassemble_bpf_image(struct symbol *sym, struct annotate_args *args); + +#endif /* __PERF_DISASM_BPF_H */ -- cgit v1.2.3 From 0fe881f10ceb7ae2be455d8bdf70fab18c0a5c5f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 30 Jul 2024 12:17:44 -0700 Subject: perf jevents: Autogenerate empty-pmu-events.c empty-pmu-events.c exists so that builds may occur without python being installed on a system. Manually updating empty-pmu-events.c to be in sync with jevents.py is a pain, so let's use jevents.py to generate empty-pmu-events.c. 1) change jevents.py so that an arch and model of none cause generation of a pmu-events.c without any json. Add a SPDX and autogenerated warning to the start of the file. 2) change Build so that if a generated pmu-events.c for arch none and model none doesn't match empty-pmu-events.c the build fails with a cat of the differences. Update Makefile.perf to clean up the files used for this. 3) update empty-pmu-events.c to match the output of jevents.py with arch and model of none. Committer notes: The first paragraph is confusing, so I asked and Ian further clarified: --- The requirement for python hasn't changed. Case 1: no python or NO_JEVENTS=1 Build happens using empty-pmu-events.c that is checked in, no python is required. Case 2: python pmu-events.c is created by jevents.py (requiring python) and then built.
This change adds a step where the empty-pmu-events.c is created using jevents.py and that file is diffed against the checked in version. This stops the checked in empty-pmu-events.c diverging if changes are made to jevents.py. If the diff causes the build to fail then you just copy the diff empty-pmu-events.c over the checked in one. --- Reviewed-by: John Garry Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jing Zhang Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Sang Cc: Peter Zijlstra Cc: Philip Li Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Link: https://lore.kernel.org/r/20240730191744.3097329-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 + tools/perf/pmu-events/Build | 12 +- tools/perf/pmu-events/empty-pmu-events.c | 894 +++++++++++++++++++------------ tools/perf/pmu-events/jevents.py | 6 +- 4 files changed, 562 insertions(+), 352 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index f8148db5fc38..72534bb72d43 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1277,6 +1277,8 @@ clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $( $(OUTPUT)util/intel-pt-decoder/inat-tables.c \ $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \ $(OUTPUT)pmu-events/pmu-events.c \ + $(OUTPUT)pmu-events/test-empty-pmu-events.c \ + $(OUTPUT)pmu-events/empty-pmu-events.log \ $(OUTPUT)pmu-events/metric_test.log \ $(OUTPUT)$(fadvise_advice_array) \ $(OUTPUT)$(fsconfig_arrays) \ diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index 1d18bb89402e..c3fa43c49706 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -11,6 +11,8 @@ METRIC_TEST_PY = pmu-events/metric_test.py EMPTY_PMU_EVENTS_C = pmu-events/empty-pmu-events.c PMU_EVENTS_C = $(OUTPUT)pmu-events/pmu-events.c METRIC_TEST_LOG = $(OUTPUT)pmu-events/metric_test.log +TEST_EMPTY_PMU_EVENTS_C = $(OUTPUT)pmu-events/test-empty-pmu-events.c +EMPTY_PMU_EVENTS_TEST_LOG = $(OUTPUT)pmu-events/empty-pmu-events.log ifeq ($(JEVENTS_ARCH),) JEVENTS_ARCH=$(SRCARCH) @@ -31,7 +33,15 @@ $(METRIC_TEST_LOG): $(METRIC_TEST_PY) $(METRIC_PY) $(call rule_mkdir) $(Q)$(call echo-cmd,test)$(PYTHON) $< 2> $@ || (cat $@ && false) -$(PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) +$(TEST_EMPTY_PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) none none pmu-events/arch $@ + +$(EMPTY_PMU_EVENTS_TEST_LOG): $(EMPTY_PMU_EVENTS_C) $(TEST_EMPTY_PMU_EVENTS_C) + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)diff -u $? 2> $@ || (cat $@ && false) + +$(PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) $(EMPTY_PMU_EVENTS_TEST_LOG) $(call rule_mkdir) $(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) $(JEVENTS_MODEL) pmu-events/arch $@ endif diff --git a/tools/perf/pmu-events/empty-pmu-events.c b/tools/perf/pmu-events/empty-pmu-events.c index 13727421d424..c592079982fb 100644 --- a/tools/perf/pmu-events/empty-pmu-events.c +++ b/tools/perf/pmu-events/empty-pmu-events.c @@ -1,196 +1,193 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * An empty pmu-events.c file used when there is no architecture json files in - * arch or when the jevents.py script cannot be run. - * - * The test cpu/soc is provided for testing. 
- */ -#include "pmu-events/pmu-events.h" + +/* SPDX-License-Identifier: GPL-2.0 */ +/* THIS FILE WAS AUTOGENERATED BY jevents.py arch=none model=none ! */ + +#include #include "util/header.h" #include "util/pmu.h" #include #include -static const struct pmu_event pmu_events__test_soc_cpu[] = { - { - .name = "l3_cache_rd", - .event = "event=0x40", - .desc = "L3 cache access, read", - .topic = "cache", - .long_desc = "Attributable Level 3 cache access, read", - }, - { - .name = "segment_reg_loads.any", - .event = "event=0x6,period=200000,umask=0x80", - .desc = "Number of segment register loads", - .topic = "other", - }, - { - .name = "dispatch_blocked.any", - .event = "event=0x9,period=200000,umask=0x20", - .desc = "Memory cluster signals to block micro-op dispatch for any reason", - .topic = "other", - }, - { - .name = "eist_trans", - .event = "event=0x3a,period=200000,umask=0x0", - .desc = "Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions", - .topic = "other", - }, - { - .name = "uncore_hisi_ddrc.flux_wcmd", - .event = "event=0x2", - .desc = "DDRC write commands. Unit: hisi_sccl,ddrc ", - .topic = "uncore", - .long_desc = "DDRC write commands", - .pmu = "hisi_sccl,ddrc", - }, - { - .name = "unc_cbo_xsnp_response.miss_eviction", - .event = "event=0x22,umask=0x81", - .desc = "A cross-core snoop resulted from L3 Eviction which misses in some processor core. Unit: uncore_cbox ", - .topic = "uncore", - .long_desc = "A cross-core snoop resulted from L3 Eviction which misses in some processor core", - .pmu = "uncore_cbox", - }, - { - .name = "event-hyphen", - .event = "event=0xe0,umask=0x00", - .desc = "UNC_CBO_HYPHEN. Unit: uncore_cbox ", - .topic = "uncore", - .long_desc = "UNC_CBO_HYPHEN", - .pmu = "uncore_cbox", - }, - { - .name = "event-two-hyph", - .event = "event=0xc0,umask=0x00", - .desc = "UNC_CBO_TWO_HYPH. Unit: uncore_cbox ", - .topic = "uncore", - .long_desc = "UNC_CBO_TWO_HYPH", - .pmu = "uncore_cbox", - }, - { - .name = "uncore_hisi_l3c.rd_hit_cpipe", - .event = "event=0x7", - .desc = "Total read hits. Unit: hisi_sccl,l3c ", - .topic = "uncore", - .long_desc = "Total read hits", - .pmu = "hisi_sccl,l3c", - }, - { - .name = "uncore_imc_free_running.cache_miss", - .event = "event=0x12", - .desc = "Total cache misses. Unit: uncore_imc_free_running ", - .topic = "uncore", - .long_desc = "Total cache misses", - .pmu = "uncore_imc_free_running", - }, - { - .name = "uncore_imc.cache_hits", - .event = "event=0x34", - .desc = "Total cache hits. 
Unit: uncore_imc ", - .topic = "uncore", - .long_desc = "Total cache hits", - .pmu = "uncore_imc", - }, - { - .name = "bp_l1_btb_correct", - .event = "event=0x8a", - .desc = "L1 BTB Correction", - .topic = "branch", - }, - { - .name = "bp_l2_btb_correct", - .event = "event=0x8b", - .desc = "L2 BTB Correction", - .topic = "branch", - }, - { - .name = 0, - .event = 0, - .desc = 0, - }, +struct compact_pmu_event { + int offset; }; -static const struct pmu_metric pmu_metrics__test_soc_cpu[] = { - { - .metric_expr = "1 / IPC", - .metric_name = "CPI", - }, - { - .metric_expr = "inst_retired.any / cpu_clk_unhalted.thread", - .metric_name = "IPC", - .metric_group = "group1", - }, - { - .metric_expr = "idq_uops_not_delivered.core / (4 * (( ( cpu_clk_unhalted.thread / 2 ) * " - "( 1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk ) )))", - .metric_name = "Frontend_Bound_SMT", - }, - { - .metric_expr = "l1d\\-loads\\-misses / inst_retired.any", - .metric_name = "dcache_miss_cpi", - }, - { - .metric_expr = "l1i\\-loads\\-misses / inst_retired.any", - .metric_name = "icache_miss_cycles", - }, - { - .metric_expr = "(dcache_miss_cpi + icache_miss_cycles)", - .metric_name = "cache_miss_cycles", - .metric_group = "group1", - }, - { - .metric_expr = "l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit", - .metric_name = "DCache_L2_All_Hits", - }, - { - .metric_expr = "max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + " - "l2_rqsts.pf_miss + l2_rqsts.rfo_miss", - .metric_name = "DCache_L2_All_Miss", - }, - { - .metric_expr = "DCache_L2_All_Hits + DCache_L2_All_Miss", - .metric_name = "DCache_L2_All", - }, - { - .metric_expr = "d_ratio(DCache_L2_All_Hits, DCache_L2_All)", - .metric_name = "DCache_L2_Hits", - }, - { - .metric_expr = "d_ratio(DCache_L2_All_Miss, DCache_L2_All)", - .metric_name = "DCache_L2_Misses", - }, - { - .metric_expr = "ipc + M2", - .metric_name = "M1", - }, - { - .metric_expr = "ipc + M1", - .metric_name = "M2", - }, - { - .metric_expr = "1/M3", - .metric_name = "M3", - }, - { - .metric_expr = "64 * l1d.replacement / 1000000000 / duration_time", - .metric_name = "L1D_Cache_Fill_BW", - }, - { - .metric_expr = 0, - .metric_name = 0, - }, +struct pmu_table_entry { + const struct compact_pmu_event *entries; + uint32_t num_entries; + struct compact_pmu_event pmu_name; +}; + +static const char *const big_c_string = +/* offset=0 */ "default_core\000" +/* offset=13 */ "bp_l1_btb_correct\000branch\000L1 BTB Correction\000event=0x8a\000\00000\000\000" +/* offset=72 */ "bp_l2_btb_correct\000branch\000L2 BTB Correction\000event=0x8b\000\00000\000\000" +/* offset=131 */ "l3_cache_rd\000cache\000L3 cache access, read\000event=0x40\000\00000\000Attributable Level 3 cache access, read\000" +/* offset=226 */ "segment_reg_loads.any\000other\000Number of segment register loads\000event=6,period=200000,umask=0x80\000\00000\000\000" +/* offset=325 */ "dispatch_blocked.any\000other\000Memory cluster signals to block micro-op dispatch for any reason\000event=9,period=200000,umask=0x20\000\00000\000\000" +/* offset=455 */ "eist_trans\000other\000Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions\000event=0x3a,period=200000\000\00000\000\000" +/* offset=570 */ "hisi_sccl,ddrc\000" +/* offset=585 */ "uncore_hisi_ddrc.flux_wcmd\000uncore\000DDRC write commands\000event=2\000\00000\000DDRC write commands\000" +/* offset=671 */ "uncore_cbox\000" +/* offset=683 */ "unc_cbo_xsnp_response.miss_eviction\000uncore\000A cross-core snoop resulted from L3 
Eviction which misses in some processor core\000event=0x22,umask=0x81\000\00000\000A cross-core snoop resulted from L3 Eviction which misses in some processor core\000" +/* offset=914 */ "event-hyphen\000uncore\000UNC_CBO_HYPHEN\000event=0xe0\000\00000\000UNC_CBO_HYPHEN\000" +/* offset=979 */ "event-two-hyph\000uncore\000UNC_CBO_TWO_HYPH\000event=0xc0\000\00000\000UNC_CBO_TWO_HYPH\000" +/* offset=1050 */ "hisi_sccl,l3c\000" +/* offset=1064 */ "uncore_hisi_l3c.rd_hit_cpipe\000uncore\000Total read hits\000event=7\000\00000\000Total read hits\000" +/* offset=1144 */ "uncore_imc_free_running\000" +/* offset=1168 */ "uncore_imc_free_running.cache_miss\000uncore\000Total cache misses\000event=0x12\000\00000\000Total cache misses\000" +/* offset=1263 */ "uncore_imc\000" +/* offset=1274 */ "uncore_imc.cache_hits\000uncore\000Total cache hits\000event=0x34\000\00000\000Total cache hits\000" +/* offset=1352 */ "uncore_sys_ddr_pmu\000" +/* offset=1371 */ "sys_ddr_pmu.write_cycles\000uncore\000ddr write-cycles event\000event=0x2b\000v8\00000\000\000" +/* offset=1444 */ "uncore_sys_ccn_pmu\000" +/* offset=1463 */ "sys_ccn_pmu.read_cycles\000uncore\000ccn read-cycles event\000config=0x2c\0000x01\00000\000\000" +/* offset=1537 */ "uncore_sys_cmn_pmu\000" +/* offset=1556 */ "sys_cmn_pmu.hnf_cache_miss\000uncore\000Counts total cache misses in first lookup result (high priority)\000eventid=1,type=5\000(434|436|43c|43a).*\00000\000\000" +/* offset=1696 */ "CPI\000\0001 / IPC\000\000\000\000\000\000\000\00000" +/* offset=1718 */ "IPC\000group1\000inst_retired.any / cpu_clk_unhalted.thread\000\000\000\000\000\000\000\00000" +/* offset=1781 */ "Frontend_Bound_SMT\000\000idq_uops_not_delivered.core / (4 * (cpu_clk_unhalted.thread / 2 * (1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk)))\000\000\000\000\000\000\000\00000" +/* offset=1947 */ "dcache_miss_cpi\000\000l1d\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\00000" +/* offset=2011 */ "icache_miss_cycles\000\000l1i\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\00000" +/* offset=2078 */ "cache_miss_cycles\000group1\000dcache_miss_cpi + icache_miss_cycles\000\000\000\000\000\000\000\00000" +/* offset=2149 */ "DCache_L2_All_Hits\000\000l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit\000\000\000\000\000\000\000\00000" +/* offset=2243 */ "DCache_L2_All_Miss\000\000max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + l2_rqsts.pf_miss + l2_rqsts.rfo_miss\000\000\000\000\000\000\000\00000" +/* offset=2377 */ "DCache_L2_All\000\000DCache_L2_All_Hits + DCache_L2_All_Miss\000\000\000\000\000\000\000\00000" +/* offset=2441 */ "DCache_L2_Hits\000\000d_ratio(DCache_L2_All_Hits, DCache_L2_All)\000\000\000\000\000\000\000\00000" +/* offset=2509 */ "DCache_L2_Misses\000\000d_ratio(DCache_L2_All_Miss, DCache_L2_All)\000\000\000\000\000\000\000\00000" +/* offset=2579 */ "M1\000\000ipc + M2\000\000\000\000\000\000\000\00000" +/* offset=2601 */ "M2\000\000ipc + M1\000\000\000\000\000\000\000\00000" +/* offset=2623 */ "M3\000\0001 / M3\000\000\000\000\000\000\000\00000" +/* offset=2643 */ "L1D_Cache_Fill_BW\000\00064 * l1d.replacement / 1e9 / duration_time\000\000\000\000\000\000\000\00000" +; + +static const struct compact_pmu_event pmu_events__test_soc_cpu_default_core[] = { +{ 13 }, /* bp_l1_btb_correct\000branch\000L1 BTB Correction\000event=0x8a\000\00000\000\000 */ +{ 72 }, /* bp_l2_btb_correct\000branch\000L2 BTB Correction\000event=0x8b\000\00000\000\000 */ +{ 325 }, /* 
dispatch_blocked.any\000other\000Memory cluster signals to block micro-op dispatch for any reason\000event=9,period=200000,umask=0x20\000\00000\000\000 */ +{ 455 }, /* eist_trans\000other\000Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions\000event=0x3a,period=200000\000\00000\000\000 */ +{ 131 }, /* l3_cache_rd\000cache\000L3 cache access, read\000event=0x40\000\00000\000Attributable Level 3 cache access, read\000 */ +{ 226 }, /* segment_reg_loads.any\000other\000Number of segment register loads\000event=6,period=200000,umask=0x80\000\00000\000\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_hisi_sccl_ddrc[] = { +{ 585 }, /* uncore_hisi_ddrc.flux_wcmd\000uncore\000DDRC write commands\000event=2\000\00000\000DDRC write commands\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_hisi_sccl_l3c[] = { +{ 1064 }, /* uncore_hisi_l3c.rd_hit_cpipe\000uncore\000Total read hits\000event=7\000\00000\000Total read hits\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_uncore_cbox[] = { +{ 914 }, /* event-hyphen\000uncore\000UNC_CBO_HYPHEN\000event=0xe0\000\00000\000UNC_CBO_HYPHEN\000 */ +{ 979 }, /* event-two-hyph\000uncore\000UNC_CBO_TWO_HYPH\000event=0xc0\000\00000\000UNC_CBO_TWO_HYPH\000 */ +{ 683 }, /* unc_cbo_xsnp_response.miss_eviction\000uncore\000A cross-core snoop resulted from L3 Eviction which misses in some processor core\000event=0x22,umask=0x81\000\00000\000A cross-core snoop resulted from L3 Eviction which misses in some processor core\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_uncore_imc[] = { +{ 1274 }, /* uncore_imc.cache_hits\000uncore\000Total cache hits\000event=0x34\000\00000\000Total cache hits\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_uncore_imc_free_running[] = { +{ 1168 }, /* uncore_imc_free_running.cache_miss\000uncore\000Total cache misses\000event=0x12\000\00000\000Total cache misses\000 */ + +}; + +const struct pmu_table_entry pmu_events__test_soc_cpu[] = { +{ + .entries = pmu_events__test_soc_cpu_default_core, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_default_core), + .pmu_name = { 0 /* default_core\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_hisi_sccl_ddrc, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_hisi_sccl_ddrc), + .pmu_name = { 570 /* hisi_sccl,ddrc\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_hisi_sccl_l3c, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_hisi_sccl_l3c), + .pmu_name = { 1050 /* hisi_sccl,l3c\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_uncore_cbox, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_uncore_cbox), + .pmu_name = { 671 /* uncore_cbox\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_uncore_imc, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_uncore_imc), + .pmu_name = { 1263 /* uncore_imc\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_uncore_imc_free_running, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_uncore_imc_free_running), + .pmu_name = { 1144 /* uncore_imc_free_running\000 */ }, +}, }; +static const struct compact_pmu_event pmu_metrics__test_soc_cpu_default_core[] = { +{ 1696 }, /* CPI\000\0001 / IPC\000\000\000\000\000\000\000\00000 */ +{ 2377 }, /* DCache_L2_All\000\000DCache_L2_All_Hits + DCache_L2_All_Miss\000\000\000\000\000\000\000\00000 */ +{ 2149 }, /* DCache_L2_All_Hits\000\000l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit\000\000\000\000\000\000\000\00000 */ +{ 
2243 }, /* DCache_L2_All_Miss\000\000max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + l2_rqsts.pf_miss + l2_rqsts.rfo_miss\000\000\000\000\000\000\000\00000 */ +{ 2441 }, /* DCache_L2_Hits\000\000d_ratio(DCache_L2_All_Hits, DCache_L2_All)\000\000\000\000\000\000\000\00000 */ +{ 2509 }, /* DCache_L2_Misses\000\000d_ratio(DCache_L2_All_Miss, DCache_L2_All)\000\000\000\000\000\000\000\00000 */ +{ 1781 }, /* Frontend_Bound_SMT\000\000idq_uops_not_delivered.core / (4 * (cpu_clk_unhalted.thread / 2 * (1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk)))\000\000\000\000\000\000\000\00000 */ +{ 1718 }, /* IPC\000group1\000inst_retired.any / cpu_clk_unhalted.thread\000\000\000\000\000\000\000\00000 */ +{ 2643 }, /* L1D_Cache_Fill_BW\000\00064 * l1d.replacement / 1e9 / duration_time\000\000\000\000\000\000\000\00000 */ +{ 2579 }, /* M1\000\000ipc + M2\000\000\000\000\000\000\000\00000 */ +{ 2601 }, /* M2\000\000ipc + M1\000\000\000\000\000\000\000\00000 */ +{ 2623 }, /* M3\000\0001 / M3\000\000\000\000\000\000\000\00000 */ +{ 2078 }, /* cache_miss_cycles\000group1\000dcache_miss_cpi + icache_miss_cycles\000\000\000\000\000\000\000\00000 */ +{ 1947 }, /* dcache_miss_cpi\000\000l1d\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\00000 */ +{ 2011 }, /* icache_miss_cycles\000\000l1i\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\00000 */ + +}; + +const struct pmu_table_entry pmu_metrics__test_soc_cpu[] = { +{ + .entries = pmu_metrics__test_soc_cpu_default_core, + .num_entries = ARRAY_SIZE(pmu_metrics__test_soc_cpu_default_core), + .pmu_name = { 0 /* default_core\000 */ }, +}, +}; + +static const struct compact_pmu_event pmu_events__test_soc_sys_uncore_sys_ccn_pmu[] = { +{ 1463 }, /* sys_ccn_pmu.read_cycles\000uncore\000ccn read-cycles event\000config=0x2c\0000x01\00000\000\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_sys_uncore_sys_cmn_pmu[] = { +{ 1556 }, /* sys_cmn_pmu.hnf_cache_miss\000uncore\000Counts total cache misses in first lookup result (high priority)\000eventid=1,type=5\000(434|436|43c|43a).*\00000\000\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_sys_uncore_sys_ddr_pmu[] = { +{ 1371 }, /* sys_ddr_pmu.write_cycles\000uncore\000ddr write-cycles event\000event=0x2b\000v8\00000\000\000 */ + +}; + +const struct pmu_table_entry pmu_events__test_soc_sys[] = { +{ + .entries = pmu_events__test_soc_sys_uncore_sys_ccn_pmu, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_sys_uncore_sys_ccn_pmu), + .pmu_name = { 1444 /* uncore_sys_ccn_pmu\000 */ }, +}, +{ + .entries = pmu_events__test_soc_sys_uncore_sys_cmn_pmu, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_sys_uncore_sys_cmn_pmu), + .pmu_name = { 1537 /* uncore_sys_cmn_pmu\000 */ }, +}, +{ + .entries = pmu_events__test_soc_sys_uncore_sys_ddr_pmu, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_sys_uncore_sys_ddr_pmu), + .pmu_name = { 1352 /* uncore_sys_ddr_pmu\000 */ }, +}, +}; + + /* Struct used to make the PMU event table implementation opaque to callers. */ struct pmu_events_table { - const struct pmu_event *entries; + const struct pmu_table_entry *pmus; + uint32_t num_pmus; }; /* Struct used to make the PMU metric table implementation opaque to callers. */ struct pmu_metrics_table { - const struct pmu_metric *entries; + const struct pmu_table_entry *pmus; + uint32_t num_pmus; }; /* @@ -202,92 +199,191 @@ struct pmu_metrics_table { * The cpuid can contain any character other than the comma. 
*/ struct pmu_events_map { - const char *arch; - const char *cpuid; - const struct pmu_events_table event_table; - const struct pmu_metrics_table metric_table; + const char *arch; + const char *cpuid; + struct pmu_events_table event_table; + struct pmu_metrics_table metric_table; }; /* * Global table mapping each known CPU for the architecture to its * table of PMU events. */ -static const struct pmu_events_map pmu_events_map[] = { - { - .arch = "testarch", - .cpuid = "testcpu", - .event_table = { pmu_events__test_soc_cpu }, - .metric_table = { pmu_metrics__test_soc_cpu }, - }, - { - .arch = 0, - .cpuid = 0, - .event_table = { 0 }, - .metric_table = { 0 }, - }, -}; - -static const struct pmu_event pmu_events__test_soc_sys[] = { - { - .name = "sys_ddr_pmu.write_cycles", - .event = "event=0x2b", - .desc = "ddr write-cycles event. Unit: uncore_sys_ddr_pmu ", - .compat = "v8", - .topic = "uncore", - .pmu = "uncore_sys_ddr_pmu", - }, - { - .name = "sys_ccn_pmu.read_cycles", - .event = "config=0x2c", - .desc = "ccn read-cycles event. Unit: uncore_sys_ccn_pmu ", - .compat = "0x01", - .topic = "uncore", - .pmu = "uncore_sys_ccn_pmu", - }, - { - .name = "sys_cmn_pmu.hnf_cache_miss", - .event = "eventid=0x1,type=0x5", - .desc = "Counts total cache misses in first lookup result (high priority). Unit: uncore_sys_cmn_pmu ", - .compat = "(434|436|43c|43a).*", - .topic = "uncore", - .pmu = "uncore_sys_cmn_pmu", - }, - { - .name = 0, - .event = 0, - .desc = 0, - }, +const struct pmu_events_map pmu_events_map[] = { +{ + .arch = "testarch", + .cpuid = "testcpu", + .event_table = { + .pmus = pmu_events__test_soc_cpu, + .num_pmus = ARRAY_SIZE(pmu_events__test_soc_cpu), + }, + .metric_table = { + .pmus = pmu_metrics__test_soc_cpu, + .num_pmus = ARRAY_SIZE(pmu_metrics__test_soc_cpu), + } +}, +{ + .arch = 0, + .cpuid = 0, + .event_table = { 0, 0 }, + .metric_table = { 0, 0 }, +} }; struct pmu_sys_events { const char *name; - const struct pmu_events_table table; + struct pmu_events_table event_table; + struct pmu_metrics_table metric_table; }; static const struct pmu_sys_events pmu_sys_event_tables[] = { { - .table = { pmu_events__test_soc_sys }, + .event_table = { + .pmus = pmu_events__test_soc_sys, + .num_pmus = ARRAY_SIZE(pmu_events__test_soc_sys) + }, .name = "pmu_events__test_soc_sys", }, { - .table = { 0 } + .event_table = { 0, 0 }, + .metric_table = { 0, 0 }, }, }; -int pmu_events_table__for_each_event(const struct pmu_events_table *table, struct perf_pmu *pmu, - pmu_event_iter_fn fn, void *data) +static void decompress_event(int offset, struct pmu_event *pe) +{ + const char *p = &big_c_string[offset]; + + pe->name = (*p == '\0' ? NULL : p); + while (*p++); + pe->topic = (*p == '\0' ? NULL : p); + while (*p++); + pe->desc = (*p == '\0' ? NULL : p); + while (*p++); + pe->event = (*p == '\0' ? NULL : p); + while (*p++); + pe->compat = (*p == '\0' ? NULL : p); + while (*p++); + pe->deprecated = *p - '0'; + p++; + pe->perpkg = *p - '0'; + p++; + pe->unit = (*p == '\0' ? NULL : p); + while (*p++); + pe->long_desc = (*p == '\0' ? NULL : p); +} + +static void decompress_metric(int offset, struct pmu_metric *pm) { - for (const struct pmu_event *pe = &table->entries[0]; pe->name; pe++) { - int ret; + const char *p = &big_c_string[offset]; + + pm->metric_name = (*p == '\0' ? NULL : p); + while (*p++); + pm->metric_group = (*p == '\0' ? NULL : p); + while (*p++); + pm->metric_expr = (*p == '\0' ? NULL : p); + while (*p++); + pm->metric_threshold = (*p == '\0' ? NULL : p); + while (*p++); + pm->desc = (*p == '\0' ? 
NULL : p); + while (*p++); + pm->long_desc = (*p == '\0' ? NULL : p); + while (*p++); + pm->unit = (*p == '\0' ? NULL : p); + while (*p++); + pm->compat = (*p == '\0' ? NULL : p); + while (*p++); + pm->metricgroup_no_group = (*p == '\0' ? NULL : p); + while (*p++); + pm->default_metricgroup_name = (*p == '\0' ? NULL : p); + while (*p++); + pm->aggr_mode = *p - '0'; + p++; + pm->event_grouping = *p - '0'; +} - if (pmu && !pmu__name_match(pmu, pe->pmu)) +static int pmu_events_table__for_each_event_pmu(const struct pmu_events_table *table, + const struct pmu_table_entry *pmu, + pmu_event_iter_fn fn, + void *data) +{ + int ret; + struct pmu_event pe = { + .pmu = &big_c_string[pmu->pmu_name.offset], + }; + + for (uint32_t i = 0; i < pmu->num_entries; i++) { + decompress_event(pmu->entries[i].offset, &pe); + if (!pe.name) continue; + ret = fn(&pe, table, data); + if (ret) + return ret; + } + return 0; + } + +static int pmu_events_table__find_event_pmu(const struct pmu_events_table *table, + const struct pmu_table_entry *pmu, + const char *name, + pmu_event_iter_fn fn, + void *data) +{ + struct pmu_event pe = { + .pmu = &big_c_string[pmu->pmu_name.offset], + }; + int low = 0, high = pmu->num_entries - 1; - ret = fn(pe, table, data); - if (ret) - return ret; - } - return 0; + while (low <= high) { + int cmp, mid = (low + high) / 2; + + decompress_event(pmu->entries[mid].offset, &pe); + + if (!pe.name && !name) + goto do_call; + + if (!pe.name && name) { + low = mid + 1; + continue; + } + if (pe.name && !name) { + high = mid - 1; + continue; + } + + cmp = strcasecmp(pe.name, name); + if (cmp < 0) { + low = mid + 1; + continue; + } + if (cmp > 0) { + high = mid - 1; + continue; + } + do_call: + return fn ? fn(&pe, table, data) : 0; + } + return PMU_EVENTS__NOT_FOUND; +} + +int pmu_events_table__for_each_event(const struct pmu_events_table *table, + struct perf_pmu *pmu, + pmu_event_iter_fn fn, + void *data) +{ + for (size_t i = 0; i < table->num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &table->pmus[i]; + const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; + int ret; + + if (pmu && !pmu__name_match(pmu, pmu_name)) + continue; + + ret = pmu_events_table__for_each_event_pmu(table, table_pmu, fn, data); + if (pmu || ret) + return ret; + } + return 0; } int pmu_events_table__find_event(const struct pmu_events_table *table, @@ -296,14 +392,19 @@ int pmu_events_table__find_event(const struct pmu_events_table *table, pmu_event_iter_fn fn, void *data) { - for (const struct pmu_event *pe = &table->entries[0]; pe->name; pe++) { - if (pmu && !pmu__name_match(pmu, pe->pmu)) + for (size_t i = 0; i < table->num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &table->pmus[i]; + const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; + int ret; + + if (!pmu__name_match(pmu, pmu_name)) continue; - if (!strcasecmp(pe->name, name)) - return fn(pe, table, data); - } - return -1000; + ret = pmu_events_table__find_event_pmu(table, table_pmu, name, fn, data); + if (ret != PMU_EVENTS__NOT_FOUND) + return ret; + } + return PMU_EVENTS__NOT_FOUND; } size_t pmu_events_table__num_events(const struct pmu_events_table *table, @@ -311,160 +412,253 @@ size_t pmu_events_table__num_events(const struct pmu_events_table *table, { size_t count = 0; - for (const struct pmu_event *pe = &table->entries[0]; pe->name; pe++) { - if (pmu && !pmu__name_match(pmu, pe->pmu)) - continue; + for (size_t i = 0; i < table->num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &table->pmus[i]; + 
const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; - count++; - } + if (pmu__name_match(pmu, pmu_name)) + count += table_pmu->num_entries; + } return count; } -int pmu_metrics_table__for_each_metric(const struct pmu_metrics_table *table, pmu_metric_iter_fn fn, - void *data) +static int pmu_metrics_table__for_each_metric_pmu(const struct pmu_metrics_table *table, + const struct pmu_table_entry *pmu, + pmu_metric_iter_fn fn, + void *data) +{ + int ret; + struct pmu_metric pm = { + .pmu = &big_c_string[pmu->pmu_name.offset], + }; + + for (uint32_t i = 0; i < pmu->num_entries; i++) { + decompress_metric(pmu->entries[i].offset, &pm); + if (!pm.metric_expr) + continue; + ret = fn(&pm, table, data); + if (ret) + return ret; + } + return 0; +} + +int pmu_metrics_table__for_each_metric(const struct pmu_metrics_table *table, + pmu_metric_iter_fn fn, + void *data) { - for (const struct pmu_metric *pm = &table->entries[0]; pm->metric_expr; pm++) { - int ret = fn(pm, table, data); + for (size_t i = 0; i < table->num_pmus; i++) { + int ret = pmu_metrics_table__for_each_metric_pmu(table, &table->pmus[i], + fn, data); + + if (ret) + return ret; + } + return 0; +} - if (ret) - return ret; - } - return 0; +static const struct pmu_events_map *map_for_pmu(struct perf_pmu *pmu) +{ + static struct { + const struct pmu_events_map *map; + struct perf_pmu *pmu; + } last_result; + static struct { + const struct pmu_events_map *map; + char *cpuid; + } last_map_search; + static bool has_last_result, has_last_map_search; + const struct pmu_events_map *map = NULL; + char *cpuid = NULL; + size_t i; + + if (has_last_result && last_result.pmu == pmu) + return last_result.map; + + cpuid = perf_pmu__getcpuid(pmu); + + /* + * On some platforms which uses cpus map, cpuid can be NULL for + * PMUs other than CORE PMUs. + */ + if (!cpuid) + goto out_update_last_result; + + if (has_last_map_search && !strcmp(last_map_search.cpuid, cpuid)) { + map = last_map_search.map; + free(cpuid); + } else { + i = 0; + for (;;) { + map = &pmu_events_map[i++]; + + if (!map->arch) { + map = NULL; + break; + } + + if (!strcmp_cpuid_str(map->cpuid, cpuid)) + break; + } + free(last_map_search.cpuid); + last_map_search.cpuid = cpuid; + last_map_search.map = map; + has_last_map_search = true; + } +out_update_last_result: + last_result.pmu = pmu; + last_result.map = map; + has_last_result = true; + return map; } const struct pmu_events_table *perf_pmu__find_events_table(struct perf_pmu *pmu) { - const struct pmu_events_table *table = NULL; - char *cpuid = perf_pmu__getcpuid(pmu); - int i; + const struct pmu_events_map *map = map_for_pmu(pmu); - /* on some platforms which uses cpus map, cpuid can be NULL for - * PMUs other than CORE PMUs. 
- */ - if (!cpuid) - return NULL; + if (!map) + return NULL; - i = 0; - for (;;) { - const struct pmu_events_map *map = &pmu_events_map[i++]; + if (!pmu) + return &map->event_table; - if (!map->cpuid) - break; + for (size_t i = 0; i < map->event_table.num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &map->event_table.pmus[i]; + const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; - if (!strcmp_cpuid_str(map->cpuid, cpuid)) { - table = &map->event_table; - break; - } - } - free(cpuid); - return table; + if (pmu__name_match(pmu, pmu_name)) + return &map->event_table; + } + return NULL; } const struct pmu_metrics_table *perf_pmu__find_metrics_table(struct perf_pmu *pmu) { - const struct pmu_metrics_table *table = NULL; - char *cpuid = perf_pmu__getcpuid(pmu); - int i; + const struct pmu_events_map *map = map_for_pmu(pmu); - /* on some platforms which uses cpus map, cpuid can be NULL for - * PMUs other than CORE PMUs. - */ - if (!cpuid) - return NULL; + if (!map) + return NULL; - i = 0; - for (;;) { - const struct pmu_events_map *map = &pmu_events_map[i++]; + if (!pmu) + return &map->metric_table; - if (!map->cpuid) - break; + for (size_t i = 0; i < map->metric_table.num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &map->metric_table.pmus[i]; + const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; - if (!strcmp_cpuid_str(map->cpuid, cpuid)) { - table = &map->metric_table; - break; - } - } - free(cpuid); - return table; + if (pmu__name_match(pmu, pmu_name)) + return &map->metric_table; + } + return NULL; } const struct pmu_events_table *find_core_events_table(const char *arch, const char *cpuid) { - for (const struct pmu_events_map *tables = &pmu_events_map[0]; - tables->arch; - tables++) { - if (!strcmp(tables->arch, arch) && !strcmp_cpuid_str(tables->cpuid, cpuid)) - return &tables->event_table; - } - return NULL; + for (const struct pmu_events_map *tables = &pmu_events_map[0]; + tables->arch; + tables++) { + if (!strcmp(tables->arch, arch) && !strcmp_cpuid_str(tables->cpuid, cpuid)) + return &tables->event_table; + } + return NULL; } const struct pmu_metrics_table *find_core_metrics_table(const char *arch, const char *cpuid) { - for (const struct pmu_events_map *tables = &pmu_events_map[0]; - tables->arch; - tables++) { - if (!strcmp(tables->arch, arch) && !strcmp_cpuid_str(tables->cpuid, cpuid)) - return &tables->metric_table; - } - return NULL; + for (const struct pmu_events_map *tables = &pmu_events_map[0]; + tables->arch; + tables++) { + if (!strcmp(tables->arch, arch) && !strcmp_cpuid_str(tables->cpuid, cpuid)) + return &tables->metric_table; + } + return NULL; } int pmu_for_each_core_event(pmu_event_iter_fn fn, void *data) { - for (const struct pmu_events_map *tables = &pmu_events_map[0]; tables->arch; tables++) { - int ret = pmu_events_table__for_each_event(&tables->event_table, - /*pmu=*/ NULL, fn, data); - - if (ret) - return ret; - } - return 0; + for (const struct pmu_events_map *tables = &pmu_events_map[0]; + tables->arch; + tables++) { + int ret = pmu_events_table__for_each_event(&tables->event_table, + /*pmu=*/ NULL, fn, data); + + if (ret) + return ret; + } + return 0; } int pmu_for_each_core_metric(pmu_metric_iter_fn fn, void *data) { - for (const struct pmu_events_map *tables = &pmu_events_map[0]; - tables->arch; - tables++) { - int ret = pmu_metrics_table__for_each_metric(&tables->metric_table, fn, data); - - if (ret) - return ret; - } - return 0; + for (const struct pmu_events_map *tables = &pmu_events_map[0]; + 
tables->arch; + tables++) { + int ret = pmu_metrics_table__for_each_metric(&tables->metric_table, fn, data); + + if (ret) + return ret; + } + return 0; } const struct pmu_events_table *find_sys_events_table(const char *name) { - for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; - tables->name; - tables++) { - if (!strcmp(tables->name, name)) - return &tables->table; - } - return NULL; + for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; + tables->name; + tables++) { + if (!strcmp(tables->name, name)) + return &tables->event_table; + } + return NULL; } int pmu_for_each_sys_event(pmu_event_iter_fn fn, void *data) { - for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; - tables->name; - tables++) { - int ret = pmu_events_table__for_each_event(&tables->table, /*pmu=*/ NULL, fn, data); - - if (ret) - return ret; - } - return 0; + for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; + tables->name; + tables++) { + int ret = pmu_events_table__for_each_event(&tables->event_table, + /*pmu=*/ NULL, fn, data); + + if (ret) + return ret; + } + return 0; } -int pmu_for_each_sys_metric(pmu_metric_iter_fn fn __maybe_unused, void *data __maybe_unused) +int pmu_for_each_sys_metric(pmu_metric_iter_fn fn, void *data) { - return 0; + for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; + tables->name; + tables++) { + int ret = pmu_metrics_table__for_each_metric(&tables->metric_table, fn, data); + + if (ret) + return ret; + } + return 0; } -const char *describe_metricgroup(const char *group __maybe_unused) +static const int metricgroups[][2] = { + +}; + +const char *describe_metricgroup(const char *group) { - return NULL; + int low = 0, high = (int)ARRAY_SIZE(metricgroups) - 1; + + while (low <= high) { + int mid = (low + high) / 2; + const char *mgroup = &big_c_string[metricgroups[mid][0]]; + int cmp = strcmp(mgroup, group); + + if (cmp == 0) { + return &big_c_string[metricgroups[mid][1]]; + } else if (cmp < 0) { + low = mid + 1; + } else { + high = mid - 1; + } + } + return NULL; } diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 731776e29f47..fcf0158438b5 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -1256,6 +1256,10 @@ such as "arm/cortex-a34".''', 'output_file', type=argparse.FileType('w', encoding='utf-8'), nargs='?', default=sys.stdout) _args = ap.parse_args() + _args.output_file.write(f""" +/* SPDX-License-Identifier: GPL-2.0 */ +/* THIS FILE WAS AUTOGENERATED BY jevents.py arch={_args.arch} model={_args.model} ! */ +""") _args.output_file.write(""" #include #include "util/header.h" @@ -1281,7 +1285,7 @@ struct pmu_table_entry { if item.name == _args.arch or _args.arch == 'all' or item.name == 'test': archs.append(item.name) - if len(archs) < 2: + if len(archs) < 2 and _args.arch != 'none': raise IOError(f'Missing architecture directory \'{_args.arch}\'') archs.sort() -- cgit v1.2.3 From 7c5dd51bbb6767095df8a5e9e2b4528b8af18538 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 31 Jul 2024 16:00:05 -0700 Subject: perf python: Remove PYTHON_PERF ifdefs When perf code was compiled one way for the binary and another for the python module, the PYTHON_PERF ifdef was used to remove some code from the python module. Since switching to building the perf code as a series of libraries, with the same libraries being used for the python module, the ifdefs became unused as PYTHON_PERF is never defined. As such remove the ifdefs. 
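Since the binary and the python module now link the very same libraries, code wrapped in #ifndef PYTHON_PERF was being compiled unconditionally anyway. A minimal standalone sketch of why such a guard goes dead once no build passes -DPYTHON_PERF (hypothetical names, not perf code):

#include <stdio.h>

#ifndef PYTHON_PERF	/* the macro is never defined, so this is always true */
static void shared_helper(void)	/* hypothetical stand-in for the guarded perf code */
{
	printf("built into the binary and the python module alike\n");
}
#endif

int main(void)
{
	/* behaves exactly as if the #ifndef/#endif pair were deleted */
	shared_helper();
	return 0;
}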
Fixes: 9dabf4003423c8d3 ("perf python: Switch module to linking libraries from building source") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240731230005.12295-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel_fprintf.c | 2 -- tools/perf/util/mmap.c | 4 ---- 2 files changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index 8719b3cb5646..c2c0500d5da9 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -107,7 +107,6 @@ out: return ++printed; } -#ifndef PYTHON_PERF int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, unsigned int print_opts, struct callchain_cursor *cursor, struct strlist *bt_stop_list, FILE *fp) @@ -248,4 +247,3 @@ int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, return printed; } -#endif /* PYTHON_PERF */ diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 122ee198a86e..43b02293f1d2 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -230,9 +230,7 @@ void mmap__munmap(struct mmap *map) { bitmap_free(map->affinity_mask.bits); -#ifndef PYTHON_PERF zstd_fini(&map->zstd_data); -#endif perf_mmap__aio_munmap(map); if (map->data != NULL) { @@ -295,12 +293,10 @@ int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, struct perf_cpu map->core.flush = mp->flush; -#ifndef PYTHON_PERF if (zstd_init(&map->zstd_data, mp->comp_level)) { pr_debug2("failed to init mmap compressor, error %d\n", errno); return -1; } -#endif if (mp->comp_level && !perf_mmap__aio_enabled(map)) { map->data = mmap(NULL, mmap__mmap_len(map), PROT_READ|PROT_WRITE, -- cgit v1.2.3 From 96465e0179fa8a7059bd48a81a81889dcad7d543 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:00 -0700 Subject: perf hist: Correct hist_entry->mem_info refcounts The 'struct mem_info' is created by iter_prepare_mem_entry() at the beginning and destroyed by iter_finish_mem_entry() at the end. So if it's used in a new hist_entry, it should be cloned. Simplify (hopefully) the logic by adding some helper functions and by not holding the refcount in the temporary entry. 
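To make the ownership rule concrete, here is a minimal sketch using made-up names (info_new/info_clone/info_put stand in for the mem_info helpers and are not the perf API): the iterator's temporary object is deep-copied before a longer-lived entry stores it, so each side frees exactly what it owns.

#include <stdlib.h>

struct info {
	int refcnt;
	unsigned long val;
};

static struct info *info_new(unsigned long val)
{
	struct info *i = calloc(1, sizeof(*i));

	if (i) {
		i->refcnt = 1;
		i->val = val;
	}
	return i;
}

static struct info *info_clone(const struct info *src)
{
	/* give the copy an independent lifetime instead of bumping src->refcnt */
	return src ? info_new(src->val) : NULL;
}

static void info_put(struct info *i)
{
	if (i && --i->refcnt == 0)
		free(i);
}

int main(void)
{
	struct info *tmp = info_new(42);	/* owned by the "iterator" */
	struct info *stored = info_clone(tmp);	/* owned by the "hist_entry" */

	info_put(tmp);		/* iter_finish_*() may now free unconditionally */
	info_put(stored);	/* entry teardown frees its own copy */
	return 0;
}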
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 14 +++++++------- tools/perf/util/map_symbol.c | 18 ++++++++++++++++++ tools/perf/util/map_symbol.h | 3 +++ tools/perf/util/mem-info.c | 13 +++++++++++++ tools/perf/util/mem-info.h | 1 + 5 files changed, 42 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index f028f113c4fd..f8ee1cd6929d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -476,6 +476,12 @@ static int hist_entry__init(struct hist_entry *he, he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map); } + if (he->mem_info) { + he->mem_info = mem_info__clone(template->mem_info); + if (he->mem_info == NULL) + goto err_infos; + } + if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) callchain_init(he->callchain); @@ -620,12 +626,6 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (symbol_conf.cumulate_callchain) he_stat__add_period(he->stat_acc, period); - /* - * This mem info was allocated from sample__resolve_mem - * and will not be used anymore. - */ - mem_info__zput(entry->mem_info); - block_info__delete(entry->block_info); kvm_info__zput(entry->kvm_info); @@ -739,7 +739,7 @@ __hists__add_entry(struct hists *hists, .filtered = symbol__parent_filter(sym_parent) | al->filtered, .hists = hists, .branch_info = bi, - .mem_info = mem_info__get(mi), + .mem_info = mi, .kvm_info = ki, .block_info = block_info, .transaction = sample->transaction, diff --git a/tools/perf/util/map_symbol.c b/tools/perf/util/map_symbol.c index bef5079f2403..6ad2960bc289 100644 --- a/tools/perf/util/map_symbol.c +++ b/tools/perf/util/map_symbol.c @@ -13,3 +13,21 @@ void addr_map_symbol__exit(struct addr_map_symbol *ams) { map_symbol__exit(&ams->ms); } + +void map_symbol__copy(struct map_symbol *dst, struct map_symbol *src) +{ + dst->maps = maps__get(src->maps); + dst->map = map__get(src->map); + dst->sym = src->sym; +} + +void addr_map_symbol__copy(struct addr_map_symbol *dst, struct addr_map_symbol *src) +{ + map_symbol__copy(&dst->ms, &src->ms); + + dst->addr = src->addr; + dst->al_addr = src->al_addr; + dst->al_level = src->al_level; + dst->phys_addr = src->phys_addr; + dst->data_page_size = src->data_page_size; +} diff --git a/tools/perf/util/map_symbol.h b/tools/perf/util/map_symbol.h index 72d5ed938ed6..e370bb32ed47 100644 --- a/tools/perf/util/map_symbol.h +++ b/tools/perf/util/map_symbol.h @@ -26,4 +26,7 @@ struct addr_map_symbol { void map_symbol__exit(struct map_symbol *ms); void addr_map_symbol__exit(struct addr_map_symbol *ams); +void map_symbol__copy(struct map_symbol *dst, struct map_symbol *src); +void addr_map_symbol__copy(struct addr_map_symbol *dst, struct addr_map_symbol *src); + #endif // __PERF_MAP_SYMBOL diff --git a/tools/perf/util/mem-info.c b/tools/perf/util/mem-info.c index 27d67721a695..d3efa9c139f2 100644 --- a/tools/perf/util/mem-info.c +++ b/tools/perf/util/mem-info.c @@ -33,3 +33,16 @@ struct mem_info *mem_info__new(void) return result; } + +struct mem_info *mem_info__clone(struct mem_info *mi) +{ + struct mem_info *result = mem_info__new(); + + if (result) { + addr_map_symbol__copy(mem_info__iaddr(result), mem_info__iaddr(mi)); + addr_map_symbol__copy(mem_info__daddr(result), mem_info__daddr(mi)); + 
mem_info__data_src(result)->val = mem_info__data_src(mi)->val; + } + + return result; +} diff --git a/tools/perf/util/mem-info.h b/tools/perf/util/mem-info.h index 0f68e29f311b..df75e94ed3d0 100644 --- a/tools/perf/util/mem-info.h +++ b/tools/perf/util/mem-info.h @@ -15,6 +15,7 @@ DECLARE_RC_STRUCT(mem_info) { }; struct mem_info *mem_info__new(void); +struct mem_info *mem_info__clone(struct mem_info *mi); struct mem_info *mem_info__get(struct mem_info *mi); void mem_info__put(struct mem_info *mi); -- cgit v1.2.3 From 3da209bb1177462b6fe8e3021a5527a5a49a9336 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:01 -0700 Subject: perf mem: Free the allocated sort string, fixing a leak The get_sort_order() returns either a new string (from strdup) or NULL but it never gets freed. Signed-off-by: Namhyung Kim Fixes: 2e7f545096f954a9 ("perf mem: Factor out a function to generate sort order") Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-3-namhyung@kernel.org [ Added Fixes tag ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 863fcd735dae..93413cfcd585 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -372,6 +372,7 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem) rep_argv[i] = argv[j]; ret = cmd_report(i, rep_argv); + free(new_sort_order); free(rep_argv); return ret; } -- cgit v1.2.3 From 35b38a71c92fa033aa6c8d681ee827e215254964 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:02 -0700 Subject: perf mem: Rework command option handling Split the common option and ones for record or report. Otherwise -U in the record option cannot be used because it clashes with in the common (or report) option. Also rename report_events() to __cmd_report() to follow the convention and to be sync with the record part. Also set the flag PARSE_OPT_STOP_AT_NON_OPTION for the common option so that it can show the help message in the subcommand like below: $ perf mem record -h Usage: perf mem record [] [] or: perf mem record [] -- [] -C, --cpu list of cpus to profile -e, --event event selector. 
use 'perf mem record -e list' to list available events -f, --force don't complain, do it -K, --all-kernel collect only kernel level data -p, --phys-data Record/Report sample physical addresses -t, --type memory operations(load,store) Default load,store -U, --all-user collect only user level data -v, --verbose be more verbose (show counter open errors, etc) --data-page-size Record/Report sample data address page size --ldlat mem-loads latency Cc: Aditya Gupta Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 79 +++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 93413cfcd585..819edaf6b1df 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -34,6 +34,8 @@ struct perf_mem { bool force; bool phys_addr; bool data_page_size; + bool all_kernel; + bool all_user; int operation; const char *cpu_list; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -62,33 +64,19 @@ static int parse_record_events(const struct option *opt, return 0; } -static const char * const __usage[] = { - "perf mem record [] []", - "perf mem record [] -- []", - NULL -}; - -static const char * const *record_mem_usage = __usage; - -static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) +static int __cmd_record(int argc, const char **argv, struct perf_mem *mem, + const struct option *options) { int rec_argc, i = 0, j; int start, end; const char **rec_argv; int ret; - bool all_user = false, all_kernel = false; struct perf_mem_event *e; struct perf_pmu *pmu; - struct option options[] = { - OPT_CALLBACK('e', "event", &mem, "event", - "event selector. use 'perf mem record -e list' to list available events", - parse_record_events), - OPT_UINTEGER(0, "ldlat", &perf_mem_events__loads_ldlat, "mem-loads latency"), - OPT_INCR('v', "verbose", &verbose, - "be more verbose (show counter open errors, etc)"), - OPT_BOOLEAN('U', "all-user", &all_user, "collect only user level data"), - OPT_BOOLEAN('K', "all-kernel", &all_kernel, "collect only kernel level data"), - OPT_END() + const char * const record_usage[] = { + "perf mem record [] []", + "perf mem record [] -- []", + NULL }; pmu = perf_mem_events_find_pmu(); @@ -102,7 +90,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) return -1; } - argc = parse_options(argc, argv, options, record_mem_usage, + argc = parse_options(argc, argv, options, record_usage, PARSE_OPT_KEEP_UNKNOWN); /* Max number of arguments multiplied by number of PMUs that can support them. */ @@ -158,10 +146,10 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) goto out; end = i; - if (all_user) + if (mem->all_user) rec_argv[i++] = "--all-user"; - if (all_kernel) + if (mem->all_kernel) rec_argv[i++] = "--all-kernel"; if (mem->cpu_list) { @@ -319,6 +307,7 @@ out_delete: perf_session__delete(session); return ret; } + static char *get_sort_order(struct perf_mem *mem) { bool has_extra_options = (mem->phys_addr | mem->data_page_size) ? 
true : false; @@ -346,11 +335,19 @@ static char *get_sort_order(struct perf_mem *mem) return strdup(sort); } -static int report_events(int argc, const char **argv, struct perf_mem *mem) +static int __cmd_report(int argc, const char **argv, struct perf_mem *mem, + const struct option *options) { const char **rep_argv; int ret, i = 0, j, rep_argc; char *new_sort_order; + const char * const report_usage[] = { + "perf mem report []", + NULL + }; + + argc = parse_options(argc, argv, options, report_usage, + PARSE_OPT_KEEP_UNKNOWN); if (mem->dump_raw) return report_raw_events(mem); @@ -368,7 +365,7 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem) if (new_sort_order) rep_argv[i++] = new_sort_order; - for (j = 1; j < argc; j++, i++) + for (j = 0; j < argc; j++, i++) rep_argv[i] = argv[j]; ret = cmd_report(i, rep_argv); @@ -475,22 +472,36 @@ int cmd_mem(int argc, const char **argv) OPT_CALLBACK('t', "type", &mem.operation, "type", "memory operations(load,store) Default load,store", parse_mem_ops), + OPT_STRING('C', "cpu", &mem.cpu_list, "cpu", + "list of cpus to profile"), + OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"), + OPT_INCR('v', "verbose", &verbose, + "be more verbose (show counter open errors, etc)"), + OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"), + OPT_BOOLEAN(0, "data-page-size", &mem.data_page_size, "Record/Report sample data address page size"), + OPT_END() + }; + const struct option record_options[] = { + OPT_CALLBACK('e', "event", &mem, "event", + "event selector. use 'perf mem record -e list' to list available events", + parse_record_events), + OPT_UINTEGER(0, "ldlat", &perf_mem_events__loads_ldlat, "mem-loads latency"), + OPT_BOOLEAN('U', "all-user", &mem.all_user, "collect only user level data"), + OPT_BOOLEAN('K', "all-kernel", &mem.all_kernel, "collect only kernel level data"), + OPT_PARENT(mem_options) + }; + const struct option report_options[] = { OPT_BOOLEAN('D', "dump-raw-samples", &mem.dump_raw, "dump raw samples in ASCII"), OPT_BOOLEAN('U', "hide-unresolved", &mem.hide_unresolved, "Only display entries resolved to a symbol"), OPT_STRING('i', "input", &input_name, "file", "input file name"), - OPT_STRING('C', "cpu", &mem.cpu_list, "cpu", - "list of cpus to profile"), OPT_STRING_NOEMPTY('x', "field-separator", &symbol_conf.field_sep, "separator", "separator for columns, no spaces will be added" " between columns '.' 
is reserved."), - OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"), - OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"), - OPT_BOOLEAN(0, "data-page-size", &mem.data_page_size, "Record/Report sample data address page size"), - OPT_END() + OPT_PARENT(mem_options) }; const char *const mem_subcommands[] = { "record", "report", NULL }; const char *mem_usage[] = { @@ -499,7 +510,7 @@ int cmd_mem(int argc, const char **argv) }; argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands, - mem_usage, PARSE_OPT_KEEP_UNKNOWN); + mem_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (!argc || !(strncmp(argv[0], "rec", 3) || mem.operation)) usage_with_options(mem_usage, mem_options); @@ -512,9 +523,9 @@ int cmd_mem(int argc, const char **argv) } if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) - return __cmd_record(argc, argv, &mem); + return __cmd_record(argc, argv, &mem, record_options); else if (strlen(argv[0]) > 2 && strstarts("report", argv[0])) - return report_events(argc, argv, &mem); + return __cmd_report(argc, argv, &mem, report_options); else usage_with_options(mem_usage, mem_options); -- cgit v1.2.3 From 871893d748cce346097d907b9937604af3dafcf1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:03 -0700 Subject: perf tools: Add mode argument to sort_help() Some sort keys are meaningful only in a specific mode - like branch stack and memory (data-src). Add the mode to skip unnecessary ones. This will be used for 'perf mem report' later. While at it, change the prefix for the -F/--fields option to remove the duplicate part. Before: $ perf report -F Error: switch `F' requires a value Usage: perf report [] -F, --fields output field(s): overhead period sample overhead overhead_sys overhead_us overhead_guest_sys overhead_guest_us overhead_children sample period weight1 weight2 weight3 ins_lat retire_lat ... After: $ perf report -F Error: switch `F' requires a value Usage: perf report [] -F, --fields output field(s): overhead overhead_sys overhead_us overhead_guest_sys overhead_guest_us overhead_children sample period weight1 weight2 weight3 ins_lat retire_lat ... 
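A minimal sketch of the filtering idea, with invented key tables rather than the real hpp/sort dimension arrays: the mode decides whether the branch-stack and memory tables contribute keys to the help string.

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

enum sort_mode { MODE_NORMAL, MODE_BRANCH, MODE_MEMORY };

static const char * const common_keys[] = { "pid", "comm", "dso", "symbol" };
static const char * const branch_keys[] = { "mispredict", "cycles" };
static const char * const memory_keys[] = { "mem", "tlb", "snoop" };

static void add_keys(const char * const *keys, size_t n)
{
	for (size_t i = 0; i < n; i++)
		printf(" %s", keys[i]);
}

static void print_sort_help(const char *prefix, enum sort_mode mode)
{
	printf("%s", prefix);
	add_keys(common_keys, ARRAY_SIZE(common_keys));
	if (mode == MODE_NORMAL || mode == MODE_BRANCH)
		add_keys(branch_keys, ARRAY_SIZE(branch_keys));
	if (mode == MODE_NORMAL || mode == MODE_MEMORY)
		add_keys(memory_keys, ARRAY_SIZE(memory_keys));
	printf("\n");
}

int main(void)
{
	/* memory mode: branch-only keys are filtered out of the help text */
	print_sort_help("sort by key(s):", MODE_MEMORY);
	return 0;
}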
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 4 ++-- tools/perf/util/sort.c | 12 +++++++----- tools/perf/util/sort.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 6edc0d4ce6fb..930052961c1a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1301,8 +1301,8 @@ int cmd_report(int argc, const char **argv) .socket_filter = -1, .skip_empty = true, }; - char *sort_order_help = sort_help("sort by key(s):"); - char *field_order_help = sort_help("output field(s): overhead period sample "); + char *sort_order_help = sort_help("sort by key(s):", SORT_MODE__NORMAL); + char *field_order_help = sort_help("output field(s):", SORT_MODE__NORMAL); const char *disassembler_style = NULL, *objdump_path = NULL, *addr2line_path = NULL; const struct option options[] = { OPT_STRING('i', "input", &input_name, "file", diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index ab7c7ff35f9b..c4046d5d1749 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -3960,7 +3960,7 @@ static void add_hpp_sort_string(struct strbuf *sb, struct hpp_dimension *s, int add_key(sb, s[i].name, llen); } -char *sort_help(const char *prefix) +char *sort_help(const char *prefix, enum sort_mode mode) { struct strbuf sb; char *s; @@ -3972,10 +3972,12 @@ char *sort_help(const char *prefix) ARRAY_SIZE(hpp_sort_dimensions), &len); add_sort_string(&sb, common_sort_dimensions, ARRAY_SIZE(common_sort_dimensions), &len); - add_sort_string(&sb, bstack_sort_dimensions, - ARRAY_SIZE(bstack_sort_dimensions), &len); - add_sort_string(&sb, memory_sort_dimensions, - ARRAY_SIZE(memory_sort_dimensions), &len); + if (mode == SORT_MODE__NORMAL || mode == SORT_MODE__BRANCH) + add_sort_string(&sb, bstack_sort_dimensions, + ARRAY_SIZE(bstack_sort_dimensions), &len); + if (mode == SORT_MODE__NORMAL || mode == SORT_MODE__MEMORY) + add_sort_string(&sb, memory_sort_dimensions, + ARRAY_SIZE(memory_sort_dimensions), &len); s = strbuf_detach(&sb, NULL); strbuf_release(&sb); return s; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 0bd0ee3ae76b..6357bc32c5ca 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -130,7 +130,7 @@ void reset_output_field(void); void sort__setup_elide(FILE *fp); void perf_hpp__set_elide(int idx, bool elide); -char *sort_help(const char *prefix); +char *sort_help(const char *prefix, enum sort_mode mode); int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, int unset); -- cgit v1.2.3 From 2d99a991337f70a7d11135d6a539bed8c060cb0f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:04 -0700 Subject: perf mem: Add -s/--sort option So that users can set the sort key manually as they want. 
$ perf mem report -s Error: switch `s' requires a value Usage: perf mem report [] -s, --sort sort by key(s): overhead overhead_sys overhead_us overhead_guest_sys overhead_guest_us overhead_children sample period weight1 weight2 weight3 ins_lat retire_lat p_stage_cyc pid comm dso symbol parent cpu socket srcline srcfile local_weight weight transaction trace symbol_size dso_size cgroup cgroup_id ipc_null time code_page_size local_ins_lat ins_lat local_p_stage_cyc p_stage_cyc addr local_retire_lat retire_lat simd type typeoff symoff symbol_daddr dso_daddr locked tlb mem snoop dcacheline symbol_iaddr phys_daddr data_page_size blocked Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 819edaf6b1df..24a4f0084f49 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -19,6 +19,7 @@ #include "util/symbol.h" #include "util/pmus.h" #include "util/sample.h" +#include "util/sort.h" #include "util/string2.h" #include "util/util.h" #include @@ -28,7 +29,8 @@ struct perf_mem { struct perf_tool tool; - char const *input_name; + const char *input_name; + const char *sort_key; bool hide_unresolved; bool dump_raw; bool force; @@ -313,11 +315,13 @@ static char *get_sort_order(struct perf_mem *mem) bool has_extra_options = (mem->phys_addr | mem->data_page_size) ? true : false; char sort[128]; + if (mem->sort_key) + scnprintf(sort, sizeof(sort), "--sort=%s", mem->sort_key); /* * there is no weight (cost) associated with stores, so don't print * the column */ - if (!(mem->operation & MEM_OPERATION_LOAD)) { + else if (!(mem->operation & MEM_OPERATION_LOAD)) { strcpy(sort, "--sort=mem,sym,dso,symbol_daddr," "dso_daddr,tlb,locked"); } else if (has_extra_options) { @@ -468,6 +472,7 @@ int cmd_mem(int argc, const char **argv) */ .operation = MEM_OPERATION_LOAD | MEM_OPERATION_STORE, }; + char *sort_order_help = sort_help("sort by key(s):", SORT_MODE__MEMORY); const struct option mem_options[] = { OPT_CALLBACK('t', "type", &mem.operation, "type", "memory operations(load,store) Default load,store", @@ -501,6 +506,8 @@ int cmd_mem(int argc, const char **argv) "separator", "separator for columns, no spaces will be added" " between columns '.' is reserved."), + OPT_STRING('s', "sort", &mem.sort_key, "key[,key2...]", + sort_order_help), OPT_PARENT(mem_options) }; const char *const mem_subcommands[] = { "record", "report", NULL }; -- cgit v1.2.3 From 7320ad972510415515d00838451c51f2db3974a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:05 -0700 Subject: perf mem: Add -T/--data-type option to report subcommand This is just a shortcut to have 'type' in the sort key and use more compact output format like below. $ perf mem report -T ... # # Overhead Samples Memory access Snoop TLB access Data Type # ........ ............ ....................................... ............ ...................... ......... 
# 14.84% 22 L1 hit None L1 or L2 hit (unknown) 7.68% 8 LFB/MAB hit None L1 or L2 hit (unknown) 7.17% 3 RAM hit Hit L2 miss (unknown) 6.29% 12 L1 hit None L1 or L2 hit (stack operation) 4.85% 5 RAM hit Hit L1 or L2 hit (unknown) 3.97% 5 LFB/MAB hit None L1 or L2 hit struct psi_group_cpu 3.18% 3 LFB/MAB hit None L1 or L2 hit (stack operation) 2.58% 3 L1 hit None L1 or L2 hit unsigned int 2.36% 2 L1 hit None L1 or L2 hit struct 2.31% 2 L1 hit None L1 or L2 hit struct psi_group_cpu ... Users also can use their own sort keys and -T option makes sure it has the 'type' sort key at the end. $ perf mem report -T -s mem Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 24a4f0084f49..efa700d14c98 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -38,6 +38,7 @@ struct perf_mem { bool data_page_size; bool all_kernel; bool all_user; + bool data_type; int operation; const char *cpu_list; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -317,6 +318,8 @@ static char *get_sort_order(struct perf_mem *mem) if (mem->sort_key) scnprintf(sort, sizeof(sort), "--sort=%s", mem->sort_key); + else if (mem->data_type) + strcpy(sort, "--sort=mem,snoop,tlb,type"); /* * there is no weight (cost) associated with stores, so don't print * the column @@ -336,6 +339,10 @@ static char *get_sort_order(struct perf_mem *mem) if (mem->data_page_size) strcat(sort, ",data_page_size"); + /* make sure it has 'type' sort key even -s option is used */ + if (mem->data_type && !strstr(sort, "type")) + strcat(sort, ",type"); + return strdup(sort); } @@ -508,6 +515,8 @@ int cmd_mem(int argc, const char **argv) " between columns '.' is reserved."), OPT_STRING('s', "sort", &mem.sort_key, "key[,key2...]", sort_order_help), + OPT_BOOLEAN('T', "type-profile", &mem.data_type, + "Show data-type profile result"), OPT_PARENT(mem_options) }; const char *const mem_subcommands[] = { "record", "report", NULL }; -- cgit v1.2.3 From 39e470057f785eab7b6638feb3f24cbad6816aff Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:41 +0200 Subject: tools/x86/kcpuid: Remove unused variable Global variable "num_leafs" is set in multiple places but is never read anywhere. Remove it. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-2-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 24b7d017ec2c..e1973d8b322e 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -76,7 +76,6 @@ struct cpuid_range { */ struct cpuid_range *leafs_basic, *leafs_ext; -static int num_leafs; static bool is_amd; static bool show_details; static bool show_raw; @@ -246,7 +245,6 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) allzero = cpuid_store(range, f, subleaf, eax, ebx, ecx, edx); if (allzero) continue; - num_leafs++; if (!has_subleafs(f)) continue; @@ -272,7 +270,6 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) eax, ebx, ecx, edx); if (allzero) continue; - num_leafs++; } } -- cgit v1.2.3

From a52e735f282c963155090d2d60726324ccd0e4bc Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:42 +0200 Subject: tools/x86/kcpuid: Properly align long-description columns

When kcpuid is invoked with "--all --details", the detailed description column is not properly aligned for all bitfield rows:

    CPUID_0x4_ECX[0x0]:
        cache_level          : 0x1      - Cache Level
        ...
        cache_self_init      - Cache Self Initialization

This is due to differences in output handling between boolean single-bit "bitflags" and multi-bit bitfields. For the former, the bitfield's value is not output, as it is implied to be true by just outputting the bitflag's name in its respective line. If long descriptions were requested through the --all parameter, properly align the bitflag's description columns through extra tabs. With that, the sample output above becomes:

    CPUID_0x4_ECX[0x0]:
        cache_level          : 0x1      - Cache Level
        ...
        cache_self_init                 - Cache Self Initialization

Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-3-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index e1973d8b322e..08f64d9ecb40 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -449,8 +449,9 @@ static void decode_bits(u32 value, struct reg_desc *rdesc, enum cpuid_reg reg) if (start == end) { /* single bit flag */ if (value & (1 << start)) - printf("\t%-20s %s%s\n", + printf("\t%-20s %s%s%s\n", bdesc->simp, + show_flags_only ? "" : "\t\t\t", show_details ? "-" : "", show_details ? bdesc->detail : "" ); -- cgit v1.2.3

From 5dd7ca42475bb15fd93d58d42e925512776f14a4 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:43 +0200 Subject: tools/x86/kcpuid: Set max possible subleaves count to 64

cpuid.csv will be extended in further commits with all-publicly-known CPUID leaves and bitfields. One of the new leaves is 0xd for extended CPU state enumeration. Depending on XCR0 dword bits, it can export up to 64 subleaves. Set kcpuid.c MAX_SUBLEAF_NUM to 64.

Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-4-darwi@linutronix.de
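As a quick standalone illustration of where the new bound comes from (a sketch with a made-up XCR0 value, not code from the patch below): leaf 0xd enumerates one subleaf per XCR0/IA32_XSS state-component bit, and XCR0 is a 64-bit register, so 64 subleaves is the natural ceiling.

    #include <stdio.h>

    typedef unsigned long long u64;

    #define MAX_SUBLEAF_NUM 64

    int main(void)
    {
            /* Illustrative XCR0: x87, SSE, AVX, opmask, ZMM_Hi256, Hi16_ZMM */
            u64 xcr0 = 0xe7;

            /* Subleaves 0/1 of leaf 0xd are special; 2..63 map to state bits */
            for (unsigned int subleaf = 2; subleaf < MAX_SUBLEAF_NUM; subleaf++)
                    if (xcr0 & (1ULL << subleaf))
                            printf("leaf 0xd, subleaf %u holds an XSAVE component\n",
                                   subleaf);
            return 0;
    }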
--- tools/arch/x86/kcpuid/kcpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 08f64d9ecb40..a87cddc19554 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -203,7 +203,7 @@ static void raw_dump_range(struct cpuid_range *range) } } -#define MAX_SUBLEAF_NUM 32 +#define MAX_SUBLEAF_NUM 64 struct cpuid_range *setup_cpuid_range(u32 input_eax) { u32 max_func, idx_func; -- cgit v1.2.3

From cf96ab1a966b87b09fdd9e8cc8357d2d00776a3a Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:44 +0200 Subject: tools/x86/kcpuid: Protect against faulty "max subleaf" values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Protect against the kcpuid code parsing faulty max subleaf numbers through a min() expression, ensuring that max_subleaf will always be ≤ MAX_SUBLEAF_NUM. Use "u32" for the subleaf numbers since kcpuid is compiled with -Wextra, which includes signed/unsigned comparison warnings.

Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-5-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index a87cddc19554..581d28c861ee 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -7,7 +7,8 @@ #include #include -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define min(a, b) (((a) < (b)) ? (a) : (b)) typedef unsigned int u32; typedef unsigned long long u64; @@ -206,12 +207,9 @@ static void raw_dump_range(struct cpuid_range *range) #define MAX_SUBLEAF_NUM 64 struct cpuid_range *setup_cpuid_range(u32 input_eax) { - u32 max_func, idx_func; - int subleaf; + u32 max_func, idx_func, subleaf, max_subleaf; + u32 eax, ebx, ecx, edx, f = input_eax; struct cpuid_range *range; - u32 eax, ebx, ecx, edx; - u32 f = input_eax; - int max_subleaf; bool allzero; eax = input_eax; @@ -256,7 +254,7 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) * others have to be tried (0xf) */ if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) - max_subleaf = (eax & 0xff) + 1; + max_subleaf = min((eax & 0xff) + 1, max_subleaf); if (f == 0xb) max_subleaf = 2; -- cgit v1.2.3

From 9ecbc60a5ede928abdd2b152d828ae0ea8a1e3ed Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:45 +0200 Subject: tools/x86/kcpuid: Strip bitfield names leading/trailing whitespace

While parsing and saving bitfield names from the CSV file, an extra leading space is copied verbatim. That extra space is not a big issue now, but further commits will add a new CSV file with much more padding for the bitfield's name column. Strip leading/trailing whitespace while saving bitfield names.

Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-6-darwi@linutronix.de
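A minimal standalone sketch of the trimming idiom the patch below relies on (the padded string here is invented for the demo): strtok() skips leading " \t" delimiters and terminates the token at the next one, so a single-word name comes back stripped of its column padding. Note that strtok() modifies the buffer in place, which is fine for a throwaway parse buffer.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char field[] = "   cache_self_init   ";  /* padded CSV name column */
            char name[32];

            strcpy(name, strtok(field, " \t"));
            printf("'%s'\n", name);                   /* prints 'cache_self_init' */
            return 0;
    }

This works because the short names are single tokens with no embedded spaces, which is exactly the case the commit describes.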
--- tools/arch/x86/kcpuid/kcpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 581d28c861ee..c4f0ace5c9ff 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -379,7 +379,7 @@ static int parse_line(char *line) if (start) bdesc->start = strtoul(start, NULL, 0); - strcpy(bdesc->simp, tokens[4]); + strcpy(bdesc->simp, strtok(tokens[4], " \t")); strcpy(bdesc->detail, tokens[5]); return 0; -- cgit v1.2.3

From b0a59d14966dbfe0d544b2595dcb29728d41daf3 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:46 +0200 Subject: tools/x86/kcpuid: Recognize all leaves with subleaves

cpuid.csv will be extended in further commits with all-publicly-known CPUID leaves and bitfields. Thus, modify has_subleafs() to identify all known leaves with subleaves. Remove the redundant "is_amd" check since all x86 vendors already report the maximum supported extended leaf in the leaf 0x80000000 EAX register. The extra mentioned leaves are:

- Leaf 0x12, Intel Software Guard Extensions (SGX) enumeration
- Leaf 0x14, Intel processor trace (PT) enumeration
- Leaf 0x17, Intel SoC vendor attributes enumeration
- Leaf 0x1b, Intel PCONFIG (Platform configuration) enumeration
- Leaf 0x1d, Intel AMX (Advanced Matrix Extensions) tile information
- Leaf 0x1f, Intel v2 extended topology enumeration
- Leaf 0x23, Intel ArchPerfmonExt (Architectural PMU ext) enumeration
- Leaf 0x80000020, AMD Platform QoS extended features enumeration
- Leaf 0x80000026, AMD v2 extended topology enumeration

Set the 'max_subleaf' variable for all the newly marked leaves with extra subleaves. Ideally, this should be fetched from the CSV file instead, but the current kcpuid code architecture has two runs: one run to serially invoke the cpuid instructions and save all the output in-memory, and one run to parse this in-memory output through the CSV specification.

Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-7-darwi@linutronix.de
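A trimmed-down standalone sketch of the table-driven membership test the patch below adopts (the array here is shortened for brevity; the full patch lists all seventeen leaves). Extending coverage then means appending one array entry rather than editing vendor-specific branches:

    #include <stdio.h>
    #include <stdbool.h>

    #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

    typedef unsigned int u32;

    static bool has_subleafs(u32 f)
    {
            static const u32 with_subleaves[] = {
                    0x4, 0x7, 0xb, 0xd, 0x8000001d, 0x80000026,
            };

            for (unsigned int i = 0; i < ARRAY_SIZE(with_subleaves); i++)
                    if (f == with_subleaves[i])
                            return true;
            return false;
    }

    int main(void)
    {
            printf("0x7 -> %d, 0x6 -> %d\n", has_subleafs(0x7), has_subleafs(0x6));
            return 0;
    }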
--- tools/arch/x86/kcpuid/kcpuid.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index c4f0ace5c9ff..725a7a27fcac 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -98,27 +98,17 @@ static inline void cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) static inline bool has_subleafs(u32 f) { - if (f == 0x7 || f == 0xd) - return true; - - if (is_amd) { - if (f == 0x8000001d) + u32 with_subleaves[] = { + 0x4, 0x7, 0xb, 0xd, 0xf, 0x10, 0x12, + 0x14, 0x17, 0x18, 0x1b, 0x1d, 0x1f, 0x23, + 0x8000001d, 0x80000020, 0x80000026, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(with_subleaves); i++) + if (f == with_subleaves[i]) return true; - return false; - } - switch (f) { - case 0x4: - case 0xb: - case 0xf: - case 0x10: - case 0x14: - case 0x18: - case 0x1f: - return true; - default: - return false; - } + return false; } static void leaf_print_raw(struct subleaf *leaf) @@ -253,11 +243,18 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) * Some can provide the exact number of subleafs, * others have to be tried (0xf) */ - if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) + if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18 || f == 0x1d) max_subleaf = min((eax & 0xff) + 1, max_subleaf); - if (f == 0xb) max_subleaf = 2; + if (f == 0x1f) + max_subleaf = 6; + if (f == 0x23) + max_subleaf = 4; + if (f == 0x80000020) + max_subleaf = 4; + if (f == 0x80000026) + max_subleaf = 5; for (subleaf = 1; subleaf < max_subleaf; subleaf++) { eax = f; -- cgit v1.2.3

From 58921443e9b04bbb1866070ed14ea39578302cea Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:47 +0200 Subject: tools/x86/kcpuid: Parse subleaf ranges if provided

It's a common pattern in cpuid leaves to have the same bitfields format repeated across a number of subleaves. Typically, this is used for enumerating hierarchical structures like cache and TLB levels, CPU topology levels, etc. Modify kcpuid.c to handle subleaf ranges in the CSV file subleaves column. For example, make it able to parse lines in the form:

# LEAF, SUBLEAVES, reg, bits, short_name , ...
0xb, 1:0, eax, 4:0, x2apic_id_shift , ...
0xb, 1:0, ebx, 15:0, domain_lcpus_count , ...
0xb, 1:0, ecx, 7:0, domain_nr , ...

This way, full output can be printed to the user. Signed-off-by: Ahmed S.
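A standalone sketch of the SUBLEAVES-column convention handled by the patch below (the "1:0" string is taken from the example lines above): the range is written high:low, and a bare "N" still means a single subleaf.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef unsigned int u32;

    int main(void)
    {
            char column[] = "1:0";          /* CSV SUBLEAVES field */
            char *first = strtok(column, ":");
            char *second = strtok(NULL, ":");

            u32 end = strtoul(first, NULL, 0);
            u32 start = second ? strtoul(second, NULL, 0) : end;

            for (u32 sub = start; sub <= end; sub++)
                    printf("apply this bitfield to subleaf %u\n", sub);
            return 0;
    }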
Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-8-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 50 +++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 725a7a27fcac..1b25c0a95d3f 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -305,6 +305,8 @@ static int parse_line(char *line) struct bits_desc *bdesc; int reg_index; char *start, *end; + u32 subleaf_start, subleaf_end; + unsigned bit_start, bit_end; /* Skip comments and NULL line */ if (line[0] == '#' || line[0] == '\n') @@ -343,13 +345,25 @@ static int parse_line(char *line) return 0; /* subleaf */ - sub = strtoul(tokens[1], NULL, 0); - if ((int)sub > func->nr) - return -1; + buf = tokens[1]; + end = strtok(buf, ":"); + start = strtok(NULL, ":"); + subleaf_end = strtoul(end, NULL, 0); + + /* A subleaf range is given? */ + if (start) { + subleaf_start = strtoul(start, NULL, 0); + subleaf_end = min(subleaf_end, (u32)(func->nr - 1)); + if (subleaf_start > subleaf_end) + return 0; + } else { + subleaf_start = subleaf_end; + if (subleaf_start > (u32)(func->nr - 1)) + return 0; + } - leaf = &func->leafs[sub]; + /* register */ buf = tokens[2]; - if (strcasestr(buf, "EAX")) reg_index = R_EAX; else if (strcasestr(buf, "EBX")) @@ -361,23 +375,23 @@ static int parse_line(char *line) else goto err_exit; - reg = &leaf->info[reg_index]; - bdesc = ®->descs[reg->nr++]; - /* bit flag or bits field */ buf = tokens[3]; - end = strtok(buf, ":"); - bdesc->end = strtoul(end, NULL, 0); - bdesc->start = bdesc->end; - - /* start != NULL means it is bit fields */ start = strtok(NULL, ":"); - if (start) - bdesc->start = strtoul(start, NULL, 0); - - strcpy(bdesc->simp, strtok(tokens[4], " \t")); - strcpy(bdesc->detail, tokens[5]); + bit_end = strtoul(end, NULL, 0); + bit_start = (start) ? strtoul(start, NULL, 0) : bit_end; + + for (sub = subleaf_start; sub <= subleaf_end; sub++) { + leaf = &func->leafs[sub]; + reg = &leaf->info[reg_index]; + bdesc = ®->descs[reg->nr++]; + + bdesc->end = bit_end; + bdesc->start = bit_start; + strcpy(bdesc->simp, strtok(tokens[4], " \t")); + strcpy(bdesc->detail, tokens[5]); + } return 0; err_exit: -- cgit v1.2.3 From cbbd847d107fb750e62670d0f205a7f58b36f893 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:48 +0200 Subject: tools/x86/kcpuid: Introduce a complete cpuid bitfields CSV file For parsing the cpuid bitfields, kcpuid uses an incomplete CSV file with 300+ bitfields. Use an auto-generated CSV file from the x86-cpuid.org project instead. It provides complete bitfields coverage: 830+ bitfields, all with proper descriptions. The auto-generated file has the following blurb automatically added: # SPDX-License-Identifier: CC0-1.0 # Generator: x86-cpuid-db v1.0 The generator tag includes the project's workspace "git describe" version string. It is intended for projects like KernelCI, to aid in verifying that the auto-generated files have not been tampered with. The file also has the blurb: # Auto-generated file. # Please submit all updates and bugfixes to https://x86-cpuid.org It's thus kindly requested that the Linux kernel's x86 tree maintainers enforce sending all updates to x86-cpuid.org's upstream database first, thus benefiting the whole ecosystem. Signed-off-by: Ahmed S. 
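As a standalone sketch of consuming one row of the new format (the row text is lifted from the CSV excerpts below; the name trimming mirrors the earlier strtok() fix, and the description is read to end-of-line since it may itself contain commas):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char row[] = " 0xb, 1:0, eax, 4:0, x2apic_id_shift , Bit width of this level";
            char *leaf = strtok(row, ",");
            char *subleaves = strtok(NULL, ",");
            char *reg = strtok(NULL, ",");
            char *bits = strtok(NULL, ",");
            char *name = strtok(NULL, ",");
            char *desc = strtok(NULL, "\n"); /* rest of line; may contain commas */

            name = strtok(name, " \t");      /* strip the padded name column */
            printf("leaf=%s subleaves=%s reg=%s bits=%s name=%s\n",
                   leaf, subleaves, reg, bits, name);
            printf("desc=%s\n", desc);
            return 0;
    }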
Darwish Signed-off-by: Thomas Gleixner Link: https://gitlab.com/x86-cpuid.org/x86-cpuid-db/-/blob/v1.0/LICENSE.rst Link: https://gitlab.com/x86-cpuid.org/x86-cpuid-db Link: https://lore.kernel.org/all/20240718134755.378115-9-darwi@linutronix.de --- tools/arch/x86/kcpuid/cpuid.csv | 1430 +++++++++++++++++++++++++++------------ 1 file changed, 1016 insertions(+), 414 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv index e0c25b75327e..d751eb8585d0 100644 --- a/tools/arch/x86/kcpuid/cpuid.csv +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -1,451 +1,1053 @@ -# The basic row format is: -# LEAF, SUBLEAF, register_name, bits, short_name, long_description - -# Leaf 00H - 0, 0, EAX, 31:0, max_basic_leafs, Max input value for supported subleafs - -# Leaf 01H - 1, 0, EAX, 3:0, stepping, Stepping ID - 1, 0, EAX, 7:4, model, Model - 1, 0, EAX, 11:8, family, Family ID - 1, 0, EAX, 13:12, processor, Processor Type - 1, 0, EAX, 19:16, model_ext, Extended Model ID - 1, 0, EAX, 27:20, family_ext, Extended Family ID - - 1, 0, EBX, 7:0, brand, Brand Index - 1, 0, EBX, 15:8, clflush_size, CLFLUSH line size (value * 8) in bytes - 1, 0, EBX, 23:16, max_cpu_id, Maxim number of addressable logic cpu in this package - 1, 0, EBX, 31:24, apic_id, Initial APIC ID - - 1, 0, ECX, 0, sse3, Streaming SIMD Extensions 3(SSE3) - 1, 0, ECX, 1, pclmulqdq, PCLMULQDQ instruction supported - 1, 0, ECX, 2, dtes64, DS area uses 64-bit layout - 1, 0, ECX, 3, mwait, MONITOR/MWAIT supported - 1, 0, ECX, 4, ds_cpl, CPL Qualified Debug Store which allows for branch message storage qualified by CPL - 1, 0, ECX, 5, vmx, Virtual Machine Extensions supported - 1, 0, ECX, 6, smx, Safer Mode Extension supported - 1, 0, ECX, 7, eist, Enhanced Intel SpeedStep Technology - 1, 0, ECX, 8, tm2, Thermal Monitor 2 - 1, 0, ECX, 9, ssse3, Supplemental Streaming SIMD Extensions 3 (SSSE3) - 1, 0, ECX, 10, l1_ctx_id, L1 data cache could be set to either adaptive mode or shared mode (check IA32_MISC_ENABLE bit 24 definition) - 1, 0, ECX, 11, sdbg, IA32_DEBUG_INTERFACE MSR for silicon debug supported - 1, 0, ECX, 12, fma, FMA extensions using YMM state supported - 1, 0, ECX, 13, cmpxchg16b, 'CMPXCHG16B - Compare and Exchange Bytes' supported - 1, 0, ECX, 14, xtpr_update, xTPR Update Control supported - 1, 0, ECX, 15, pdcm, Perfmon and Debug Capability present - 1, 0, ECX, 17, pcid, Process-Context Identifiers feature present - 1, 0, ECX, 18, dca, Prefetching data from a memory mapped device supported - 1, 0, ECX, 19, sse4_1, SSE4.1 feature present - 1, 0, ECX, 20, sse4_2, SSE4.2 feature present - 1, 0, ECX, 21, x2apic, x2APIC supported - 1, 0, ECX, 22, movbe, MOVBE instruction supported - 1, 0, ECX, 23, popcnt, POPCNT instruction supported - 1, 0, ECX, 24, tsc_deadline_timer, LAPIC supports one-shot operation using a TSC deadline value - 1, 0, ECX, 25, aesni, AESNI instruction supported - 1, 0, ECX, 26, xsave, XSAVE/XRSTOR processor extended states (XSETBV/XGETBV/XCR0) - 1, 0, ECX, 27, osxsave, OS has set CR4.OSXSAVE bit to enable XSETBV/XGETBV/XCR0 - 1, 0, ECX, 28, avx, AVX instruction supported - 1, 0, ECX, 29, f16c, 16-bit floating-point conversion instruction supported - 1, 0, ECX, 30, rdrand, RDRAND instruction supported - - 1, 0, EDX, 0, fpu, x87 FPU on chip - 1, 0, EDX, 1, vme, Virtual-8086 Mode Enhancement - 1, 0, EDX, 2, de, Debugging Extensions - 1, 0, EDX, 3, pse, Page Size Extensions - 1, 0, EDX, 4, tsc, Time Stamp Counter - 1, 0, EDX, 5, msr, RDMSR and WRMSR Support - 1, 0, EDX, 6, pae, 
Physical Address Extensions - 1, 0, EDX, 7, mce, Machine Check Exception - 1, 0, EDX, 8, cx8, CMPXCHG8B instr - 1, 0, EDX, 9, apic, APIC on Chip - 1, 0, EDX, 11, sep, SYSENTER and SYSEXIT instrs - 1, 0, EDX, 12, mtrr, Memory Type Range Registers - 1, 0, EDX, 13, pge, Page Global Bit - 1, 0, EDX, 14, mca, Machine Check Architecture - 1, 0, EDX, 15, cmov, Conditional Move Instrs - 1, 0, EDX, 16, pat, Page Attribute Table - 1, 0, EDX, 17, pse36, 36-Bit Page Size Extension - 1, 0, EDX, 18, psn, Processor Serial Number - 1, 0, EDX, 19, clflush, CLFLUSH instr -# 1, 0, EDX, 20, - 1, 0, EDX, 21, ds, Debug Store - 1, 0, EDX, 22, acpi, Thermal Monitor and Software Controlled Clock Facilities - 1, 0, EDX, 23, mmx, Intel MMX Technology - 1, 0, EDX, 24, fxsr, XSAVE and FXRSTOR Instrs - 1, 0, EDX, 25, sse, SSE - 1, 0, EDX, 26, sse2, SSE2 - 1, 0, EDX, 27, ss, Self Snoop - 1, 0, EDX, 28, hit, Max APIC IDs - 1, 0, EDX, 29, tm, Thermal Monitor -# 1, 0, EDX, 30, - 1, 0, EDX, 31, pbe, Pending Break Enable - -# Leaf 02H -# cache and TLB descriptor info - -# Leaf 03H -# Precessor Serial Number, introduced on Pentium III, not valid for -# latest models - -# Leaf 04H -# thread/core and cache topology - 4, 0, EAX, 4:0, cache_type, Cache type like instr/data or unified - 4, 0, EAX, 7:5, cache_level, Cache Level (starts at 1) - 4, 0, EAX, 8, cache_self_init, Cache Self Initialization - 4, 0, EAX, 9, fully_associate, Fully Associative cache -# 4, 0, EAX, 13:10, resvd, resvd - 4, 0, EAX, 25:14, max_logical_id, Max number of addressable IDs for logical processors sharing the cache - 4, 0, EAX, 31:26, max_phy_id, Max number of addressable IDs for processors in phy package - - 4, 0, EBX, 11:0, cache_linesize, Size of a cache line in bytes - 4, 0, EBX, 21:12, cache_partition, Physical Line partitions - 4, 0, EBX, 31:22, cache_ways, Ways of associativity - 4, 0, ECX, 31:0, cache_sets, Number of Sets - 1 - 4, 0, EDX, 0, c_wbinvd, 1 means WBINVD/INVD is not ganranteed to act upon lower level caches of non-originating threads sharing this cache - 4, 0, EDX, 1, c_incl, Whether cache is inclusive of lower cache level - 4, 0, EDX, 2, c_comp_index, Complex Cache Indexing - -# Leaf 05H -# MONITOR/MWAIT - 5, 0, EAX, 15:0, min_mon_size, Smallest monitor line size in bytes - 5, 0, EBX, 15:0, max_mon_size, Largest monitor line size in bytes - 5, 0, ECX, 0, mwait_ext, Enum of Monitor-Mwait extensions supported - 5, 0, ECX, 1, mwait_irq_break, Largest monitor line size in bytes - 5, 0, EDX, 3:0, c0_sub_stats, Number of C0* sub C-states supported using MWAIT - 5, 0, EDX, 7:4, c1_sub_stats, Number of C1* sub C-states supported using MWAIT - 5, 0, EDX, 11:8, c2_sub_stats, Number of C2* sub C-states supported using MWAIT - 5, 0, EDX, 15:12, c3_sub_stats, Number of C3* sub C-states supported using MWAIT - 5, 0, EDX, 19:16, c4_sub_stats, Number of C4* sub C-states supported using MWAIT - 5, 0, EDX, 23:20, c5_sub_stats, Number of C5* sub C-states supported using MWAIT - 5, 0, EDX, 27:24, c6_sub_stats, Number of C6* sub C-states supported using MWAIT - 5, 0, EDX, 31:28, c7_sub_stats, Number of C7* sub C-states supported using MWAIT - -# Leaf 06H -# Thermal & Power Management - - 6, 0, EAX, 0, dig_temp, Digital temperature sensor supported - 6, 0, EAX, 1, turbo, Intel Turbo Boost - 6, 0, EAX, 2, arat, Always running APIC timer -# 6, 0, EAX, 3, resv, Reserved - 6, 0, EAX, 4, pln, Power limit notifications supported - 6, 0, EAX, 5, ecmd, Clock modulation duty cycle extension supported - 6, 0, EAX, 6, ptm, Package thermal management supported - 6, 
0, EAX, 7, hwp, HWP base register - 6, 0, EAX, 8, hwp_notify, HWP notification - 6, 0, EAX, 9, hwp_act_window, HWP activity window - 6, 0, EAX, 10, hwp_energy, HWP energy performance preference - 6, 0, EAX, 11, hwp_pkg_req, HWP package level request -# 6, 0, EAX, 12, resv, Reserved - 6, 0, EAX, 13, hdc, HDC base registers supported - 6, 0, EAX, 14, turbo3, Turbo Boost Max 3.0 - 6, 0, EAX, 15, hwp_cap, Highest Performance change supported - 6, 0, EAX, 16, hwp_peci, HWP PECI override is supported - 6, 0, EAX, 17, hwp_flex, Flexible HWP is supported - 6, 0, EAX, 18, hwp_fast, Fast access mode for the IA32_HWP_REQUEST MSR is supported -# 6, 0, EAX, 19, resv, Reserved - 6, 0, EAX, 20, hwp_ignr, Ignoring Idle Logical Processor HWP request is supported - - 6, 0, EBX, 3:0, therm_irq_thresh, Number of Interrupt Thresholds in Digital Thermal Sensor - 6, 0, ECX, 0, aperfmperf, Presence of IA32_MPERF and IA32_APERF - 6, 0, ECX, 3, energ_bias, Performance-energy bias preference supported - -# Leaf 07H -# ECX == 0 -# AVX512 refers to https://en.wikipedia.org/wiki/AVX-512 -# XXX: Do we really need to enumerate each and every AVX512 sub features - - 7, 0, EBX, 0, fsgsbase, RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE supported - 7, 0, EBX, 1, tsc_adjust, TSC_ADJUST MSR supported - 7, 0, EBX, 2, sgx, Software Guard Extensions - 7, 0, EBX, 3, bmi1, BMI1 - 7, 0, EBX, 4, hle, Hardware Lock Elision - 7, 0, EBX, 5, avx2, AVX2 -# 7, 0, EBX, 6, fdp_excp_only, x87 FPU Data Pointer updated only on x87 exceptions - 7, 0, EBX, 7, smep, Supervisor-Mode Execution Prevention - 7, 0, EBX, 8, bmi2, BMI2 - 7, 0, EBX, 9, rep_movsb, Enhanced REP MOVSB/STOSB - 7, 0, EBX, 10, invpcid, INVPCID instruction - 7, 0, EBX, 11, rtm, Restricted Transactional Memory - 7, 0, EBX, 12, rdt_m, Intel RDT Monitoring capability - 7, 0, EBX, 13, depc_fpu_cs_ds, Deprecates FPU CS and FPU DS - 7, 0, EBX, 14, mpx, Memory Protection Extensions - 7, 0, EBX, 15, rdt_a, Intel RDT Allocation capability - 7, 0, EBX, 16, avx512f, AVX512 Foundation instr - 7, 0, EBX, 17, avx512dq, AVX512 Double and Quadword AVX512 instr - 7, 0, EBX, 18, rdseed, RDSEED instr - 7, 0, EBX, 19, adx, ADX instr - 7, 0, EBX, 20, smap, Supervisor Mode Access Prevention - 7, 0, EBX, 21, avx512ifma, AVX512 Integer Fused Multiply Add -# 7, 0, EBX, 22, resvd, resvd - 7, 0, EBX, 23, clflushopt, CLFLUSHOPT instr - 7, 0, EBX, 24, clwb, CLWB instr - 7, 0, EBX, 25, intel_pt, Intel Processor Trace instr - 7, 0, EBX, 26, avx512pf, Prefetch - 7, 0, EBX, 27, avx512er, AVX512 Exponent Reciproca instr - 7, 0, EBX, 28, avx512cd, AVX512 Conflict Detection instr - 7, 0, EBX, 29, sha, Intel Secure Hash Algorithm Extensions instr - 7, 0, EBX, 30, avx512bw, AVX512 Byte & Word instr - 7, 0, EBX, 31, avx512vl, AVX512 Vector Length Extentions (VL) - 7, 0, ECX, 0, prefetchwt1, X - 7, 0, ECX, 1, avx512vbmi, AVX512 Vector Byte Manipulation Instructions - 7, 0, ECX, 2, umip, User-mode Instruction Prevention - - 7, 0, ECX, 3, pku, Protection Keys for User-mode pages - 7, 0, ECX, 4, ospke, CR4 PKE set to enable protection keys -# 7, 0, ECX, 16:5, resvd, resvd - 7, 0, ECX, 21:17, mawau, The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode - 7, 0, ECX, 22, rdpid, RDPID and IA32_TSC_AUX -# 7, 0, ECX, 29:23, resvd, resvd - 7, 0, ECX, 30, sgx_lc, SGX Launch Configuration -# 7, 0, ECX, 31, resvd, resvd - -# Leaf 08H -# - - -# Leaf 09H -# Direct Cache Access (DCA) information - 9, 0, ECX, 31:0, dca_cap, The value of IA32_PLATFORM_DCA_CAP +# SPDX-License-Identifier: CC0-1.0 +# Generator: x86-cpuid-db 
v1.0 -# Leaf 0AH -# Architectural Performance Monitoring # -# Do we really need to print out the PMU related stuff? -# Does normal user really care about it? +# Auto-generated file. +# Please submit all updates and bugfixes to https://x86-cpuid.org # - 0xA, 0, EAX, 7:0, pmu_ver, Performance Monitoring Unit version - 0xA, 0, EAX, 15:8, pmu_gp_cnt_num, Numer of general-purose PMU counters per logical CPU - 0xA, 0, EAX, 23:16, pmu_cnt_bits, Bit wideth of PMU counter - 0xA, 0, EAX, 31:24, pmu_ebx_bits, Length of EBX bit vector to enumerate PMU events - - 0xA, 0, EBX, 0, pmu_no_core_cycle_evt, Core cycle event not available - 0xA, 0, EBX, 1, pmu_no_instr_ret_evt, Instruction retired event not available - 0xA, 0, EBX, 2, pmu_no_ref_cycle_evt, Reference cycles event not available - 0xA, 0, EBX, 3, pmu_no_llc_ref_evt, Last-level cache reference event not available - 0xA, 0, EBX, 4, pmu_no_llc_mis_evt, Last-level cache misses event not available - 0xA, 0, EBX, 5, pmu_no_br_instr_ret_evt, Branch instruction retired event not available - 0xA, 0, EBX, 6, pmu_no_br_mispredict_evt, Branch mispredict retired event not available - - 0xA, 0, ECX, 4:0, pmu_fixed_cnt_num, Performance Monitoring Unit version - 0xA, 0, ECX, 12:5, pmu_fixed_cnt_bits, Numer of PMU counters per logical CPU - -# Leaf 0BH -# Extended Topology Enumeration Leaf -# - - 0xB, 0, EAX, 4:0, id_shift, Number of bits to shift right on x2APIC ID to get a unique topology ID of the next level type - 0xB, 0, EBX, 15:0, cpu_nr, Number of logical processors at this level type - 0xB, 0, ECX, 15:8, lvl_type, 0-Invalid 1-SMT 2-Core - 0xB, 0, EDX, 31:0, x2apic_id, x2APIC ID the current logical processor - - -# Leaf 0DH -# Processor Extended State - 0xD, 0, EAX, 0, x87, X87 state - 0xD, 0, EAX, 1, sse, SSE state - 0xD, 0, EAX, 2, avx, AVX state - 0xD, 0, EAX, 4:3, mpx, MPX state - 0xD, 0, EAX, 7:5, avx512, AVX-512 state - 0xD, 0, EAX, 9, pkru, PKRU state - - 0xD, 0, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 - 0xD, 0, ECX, 31:0, max_sz_xsave, Maximum size (bytes) of the XSAVE/XRSTOR save area - - 0xD, 1, EAX, 0, xsaveopt, XSAVEOPT available - 0xD, 1, EAX, 1, xsavec, XSAVEC and compacted form supported - 0xD, 1, EAX, 2, xgetbv, XGETBV supported - 0xD, 1, EAX, 3, xsaves, XSAVES/XRSTORS and IA32_XSS supported - - 0xD, 1, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 - 0xD, 1, ECX, 8, pt, PT state - 0xD, 1, ECX, 11, cet_usr, CET user state - 0xD, 1, ECX, 12, cet_supv, CET supervisor state - 0xD, 1, ECX, 13, hdc, HDC state - 0xD, 1, ECX, 16, hwp, HWP state - -# Leaf 0FH -# Intel RDT Monitoring - - 0xF, 0, EBX, 31:0, rmid_range, Maximum range (zero-based) of RMID within this physical processor of all types - 0xF, 0, EDX, 1, l3c_rdt_mon, L3 Cache RDT Monitoring supported - - 0xF, 1, ECX, 31:0, rmid_range, Maximum range (zero-based) of RMID of this types - 0xF, 1, EDX, 0, l3c_ocp_mon, L3 Cache occupancy Monitoring supported - 0xF, 1, EDX, 1, l3c_tbw_mon, L3 Cache Total Bandwidth Monitoring supported - 0xF, 1, EDX, 2, l3c_lbw_mon, L3 Cache Local Bandwidth Monitoring supported +# The basic row format is: +# LEAF, SUBLEAVES, reg, bits, short_name , long_description + +# Leaf 0H +# Maximum standard leaf number + CPU vendor string + + 0, 0, eax, 31:0, max_std_leaf , Highest cpuid standard leaf supported + 0, 0, ebx, 31:0, cpu_vendorid_0 , CPU vendor ID string bytes 0 - 3 + 0, 0, ecx, 31:0, cpu_vendorid_2 , CPU vendor ID string bytes 8 - 11 + 0, 0, edx, 31:0, cpu_vendorid_1 , CPU vendor ID 
string bytes 4 - 7 + +# Leaf 1H +# CPU FMS (Family/Model/Stepping) + standard feature flags + + 1, 0, eax, 3:0, stepping , Stepping ID + 1, 0, eax, 7:4, base_model , Base CPU model ID + 1, 0, eax, 11:8, base_family_id , Base CPU family ID + 1, 0, eax, 13:12, cpu_type , CPU type + 1, 0, eax, 19:16, ext_model , Extended CPU model ID + 1, 0, eax, 27:20, ext_family , Extended CPU family ID + 1, 0, ebx, 7:0, brand_id , Brand index + 1, 0, ebx, 15:8, clflush_size , CLFLUSH instruction cache line size + 1, 0, ebx, 23:16, n_logical_cpu , Logical CPU (HW threads) count + 1, 0, ebx, 31:24, local_apic_id , Initial local APIC physical ID + 1, 0, ecx, 0, pni , Streaming SIMD Extensions 3 (SSE3) + 1, 0, ecx, 1, pclmulqdq , PCLMULQDQ instruction support + 1, 0, ecx, 2, dtes64 , 64-bit DS save area + 1, 0, ecx, 3, monitor , MONITOR/MWAIT support + 1, 0, ecx, 4, ds_cpl , CPL Qualified Debug Store + 1, 0, ecx, 5, vmx , Virtual Machine Extensions + 1, 0, ecx, 6, smx , Safer Mode Extensions + 1, 0, ecx, 7, est , Enhanced Intel SpeedStep + 1, 0, ecx, 8, tm2 , Thermal Monitor 2 + 1, 0, ecx, 9, ssse3 , Supplemental SSE3 + 1, 0, ecx, 10, cid , L1 Context ID + 1, 0, ecx, 11, sdbg , Sillicon Debug + 1, 0, ecx, 12, fma , FMA extensions using YMM state + 1, 0, ecx, 13, cx16 , CMPXCHG16B instruction support + 1, 0, ecx, 14, xtpr , xTPR Update Control + 1, 0, ecx, 15, pdcm , Perfmon and Debug Capability + 1, 0, ecx, 17, pcid , Process-context identifiers + 1, 0, ecx, 18, dca , Direct Cache Access + 1, 0, ecx, 19, sse4_1 , SSE4.1 + 1, 0, ecx, 20, sse4_2 , SSE4.2 + 1, 0, ecx, 21, x2apic , X2APIC support + 1, 0, ecx, 22, movbe , MOVBE instruction support + 1, 0, ecx, 23, popcnt , POPCNT instruction support + 1, 0, ecx, 24, tsc_deadline_timer , APIC timer one-shot operation + 1, 0, ecx, 25, aes , AES instructions + 1, 0, ecx, 26, xsave , XSAVE (and related instructions) support + 1, 0, ecx, 27, osxsave , XSAVE (and related instructions) are enabled by OS + 1, 0, ecx, 28, avx , AVX instructions support + 1, 0, ecx, 29, f16c , Half-precision floating-point conversion support + 1, 0, ecx, 30, rdrand , RDRAND instruction support + 1, 0, ecx, 31, guest_status , System is running as guest; (para-)virtualized system + 1, 0, edx, 0, fpu , Floating-Point Unit on-chip (x87) + 1, 0, edx, 1, vme , Virtual-8086 Mode Extensions + 1, 0, edx, 2, de , Debugging Extensions + 1, 0, edx, 3, pse , Page Size Extension + 1, 0, edx, 4, tsc , Time Stamp Counter + 1, 0, edx, 5, msr , Model-Specific Registers (RDMSR and WRMSR support) + 1, 0, edx, 6, pae , Physical Address Extensions + 1, 0, edx, 7, mce , Machine Check Exception + 1, 0, edx, 8, cx8 , CMPXCHG8B instruction + 1, 0, edx, 9, apic , APIC on-chip + 1, 0, edx, 11, sep , SYSENTER, SYSEXIT, and associated MSRs + 1, 0, edx, 12, mtrr , Memory Type Range Registers + 1, 0, edx, 13, pge , Page Global Extensions + 1, 0, edx, 14, mca , Machine Check Architecture + 1, 0, edx, 15, cmov , Conditional Move Instruction + 1, 0, edx, 16, pat , Page Attribute Table + 1, 0, edx, 17, pse36 , Page Size Extension (36-bit) + 1, 0, edx, 18, pn , Processor Serial Number + 1, 0, edx, 19, clflush , CLFLUSH instruction + 1, 0, edx, 21, dts , Debug Store + 1, 0, edx, 22, acpi , Thermal monitor and clock control + 1, 0, edx, 23, mmx , MMX instructions + 1, 0, edx, 24, fxsr , FXSAVE and FXRSTOR instructions + 1, 0, edx, 25, sse , SSE instructions + 1, 0, edx, 26, sse2 , SSE2 instructions + 1, 0, edx, 27, ss , Self Snoop + 1, 0, edx, 28, ht , Hyper-threading + 1, 0, edx, 29, tm , Thermal Monitor + 1, 0, edx, 30, ia64 , 
Legacy IA-64 (Itanium) support bit, now resreved + 1, 0, edx, 31, pbe , Pending Break Enable + +# Leaf 2H +# Intel cache and TLB information one-byte descriptors + + 2, 0, eax, 7:0, iteration_count , Number of times this CPUD leaf must be queried + 2, 0, eax, 15:8, desc1 , Descriptor #1 + 2, 0, eax, 23:16, desc2 , Descriptor #2 + 2, 0, eax, 30:24, desc3 , Descriptor #3 + 2, 0, eax, 31, eax_invalid , Descriptors 1-3 are invalid if set + 2, 0, ebx, 7:0, desc4 , Descriptor #4 + 2, 0, ebx, 15:8, desc5 , Descriptor #5 + 2, 0, ebx, 23:16, desc6 , Descriptor #6 + 2, 0, ebx, 30:24, desc7 , Descriptor #7 + 2, 0, ebx, 31, ebx_invalid , Descriptors 4-7 are invalid if set + 2, 0, ecx, 7:0, desc8 , Descriptor #8 + 2, 0, ecx, 15:8, desc9 , Descriptor #9 + 2, 0, ecx, 23:16, desc10 , Descriptor #10 + 2, 0, ecx, 30:24, desc11 , Descriptor #11 + 2, 0, ecx, 31, ecx_invalid , Descriptors 8-11 are invalid if set + 2, 0, edx, 7:0, desc12 , Descriptor #12 + 2, 0, edx, 15:8, desc13 , Descriptor #13 + 2, 0, edx, 23:16, desc14 , Descriptor #14 + 2, 0, edx, 30:24, desc15 , Descriptor #15 + 2, 0, edx, 31, edx_invalid , Descriptors 12-15 are invalid if set + +# Leaf 4H +# Intel deterministic cache parameters + + 4, 31:0, eax, 4:0, cache_type , Cache type field + 4, 31:0, eax, 7:5, cache_level , Cache level (1-based) + 4, 31:0, eax, 8, cache_self_init , Self-initialializing cache level + 4, 31:0, eax, 9, fully_associative , Fully-associative cache + 4, 31:0, eax, 25:14, num_threads_sharing , Number logical CPUs sharing this cache + 4, 31:0, eax, 31:26, num_cores_on_die , Number of cores in the physical package + 4, 31:0, ebx, 11:0, cache_linesize , System coherency line size (0-based) + 4, 31:0, ebx, 21:12, cache_npartitions , Physical line partitions (0-based) + 4, 31:0, ebx, 31:22, cache_nways , Ways of associativity (0-based) + 4, 31:0, ecx, 30:0, cache_nsets , Cache number of sets (0-based) + 4, 31:0, edx, 0, wbinvd_rll_no_guarantee, WBINVD/INVD not guaranteed for Remote Lower-Level caches + 4, 31:0, edx, 1, ll_inclusive , Cache is inclusive of Lower-Level caches + 4, 31:0, edx, 2, complex_indexing , Not a direct-mapped cache (complex function) + +# Leaf 5H +# MONITOR/MWAIT instructions enumeration + + 5, 0, eax, 15:0, min_mon_size , Smallest monitor-line size, in bytes + 5, 0, ebx, 15:0, max_mon_size , Largest monitor-line size, in bytes + 5, 0, ecx, 0, mwait_ext , Enumeration of MONITOR/MWAIT extensions is supported + 5, 0, ecx, 1, mwait_irq_break , Interrupts as a break-event for MWAIT is supported + 5, 0, edx, 3:0, n_c0_substates , Number of C0 sub C-states supported using MWAIT + 5, 0, edx, 7:4, n_c1_substates , Number of C1 sub C-states supported using MWAIT + 5, 0, edx, 11:8, n_c2_substates , Number of C2 sub C-states supported using MWAIT + 5, 0, edx, 15:12, n_c3_substates , Number of C3 sub C-states supported using MWAIT + 5, 0, edx, 19:16, n_c4_substates , Number of C4 sub C-states supported using MWAIT + 5, 0, edx, 23:20, n_c5_substates , Number of C5 sub C-states supported using MWAIT + 5, 0, edx, 27:24, n_c6_substates , Number of C6 sub C-states supported using MWAIT + 5, 0, edx, 31:28, n_c7_substates , Number of C7 sub C-states supported using MWAIT + +# Leaf 6H +# Thermal and Power Management enumeration + + 6, 0, eax, 0, dtherm , Digital temprature sensor + 6, 0, eax, 1, turbo_boost , Intel Turbo Boost + 6, 0, eax, 2, arat , Always-Running APIC Timer (not affected by p-state) + 6, 0, eax, 4, pln , Power Limit Notification (PLN) event + 6, 0, eax, 5, ecmd , Clock modulation duty cycle extension + 6, 
0, eax, 6, pts , Package thermal management + 6, 0, eax, 7, hwp , HWP (Hardware P-states) base registers are supported + 6, 0, eax, 8, hwp_notify , HWP notification (IA32_HWP_INTERRUPT MSR) + 6, 0, eax, 9, hwp_act_window , HWP activity window (IA32_HWP_REQUEST[bits 41:32]) supported + 6, 0, eax, 10, hwp_epp , HWP Energy Performance Preference + 6, 0, eax, 11, hwp_pkg_req , HWP Package Level Request + 6, 0, eax, 13, hdc_base_regs , HDC base registers are supported + 6, 0, eax, 14, turbo_boost_3_0 , Intel Turbo Boost Max 3.0 + 6, 0, eax, 15, hwp_capabilities , HWP Highest Performance change + 6, 0, eax, 16, hwp_peci_override , HWP PECI override + 6, 0, eax, 17, hwp_flexible , Flexible HWP + 6, 0, eax, 18, hwp_fast , IA32_HWP_REQUEST MSR fast access mode + 6, 0, eax, 19, hfi , HW_FEEDBACK MSRs supported + 6, 0, eax, 20, hwp_ignore_idle , Ignoring idle logical CPU HWP req is supported + 6, 0, eax, 23, thread_director , Intel thread director support + 6, 0, eax, 24, therm_interrupt_bit25 , IA32_THERM_INTERRUPT MSR bit 25 is supported + 6, 0, ebx, 3:0, n_therm_thresholds , Digital thermometer thresholds + 6, 0, ecx, 0, aperfmperf , MPERF/APERF MSRs (effective frequency interface) + 6, 0, ecx, 3, epb , IA32_ENERGY_PERF_BIAS MSR support + 6, 0, ecx, 15:8, thrd_director_nclasses , Number of classes, Intel thread director + 6, 0, edx, 0, perfcap_reporting , Performance capability reporting + 6, 0, edx, 1, encap_reporting , Energy efficiency capability reporting + 6, 0, edx, 11:8, feedback_sz , HW feedback interface struct size, in 4K pages + 6, 0, edx, 31:16, this_lcpu_hwfdbk_idx , This logical CPU index @ HW feedback struct, 0-based + +# Leaf 7H +# Extended CPU features enumeration + + 7, 0, eax, 31:0, leaf7_n_subleaves , Number of cpuid 0x7 subleaves + 7, 0, ebx, 0, fsgsbase , FSBASE/GSBASE read/write support + 7, 0, ebx, 1, tsc_adjust , IA32_TSC_ADJUST MSR supported + 7, 0, ebx, 2, sgx , Intel SGX (Software Guard Extensions) + 7, 0, ebx, 3, bmi1 , Bit manipulation extensions group 1 + 7, 0, ebx, 4, hle , Hardware Lock Elision + 7, 0, ebx, 5, avx2 , AVX2 instruction set + 7, 0, ebx, 6, fdp_excptn_only , FPU Data Pointer updated only on x87 exceptions + 7, 0, ebx, 7, smep , Supervisor Mode Execution Protection + 7, 0, ebx, 8, bmi2 , Bit manipulation extensions group 2 + 7, 0, ebx, 9, erms , Enhanced REP MOVSB/STOSB + 7, 0, ebx, 10, invpcid , INVPCID instruction (Invalidate Processor Context ID) + 7, 0, ebx, 11, rtm , Intel restricted transactional memory + 7, 0, ebx, 12, cqm , Intel RDT-CMT / AMD Platform-QoS cache monitoring + 7, 0, ebx, 13, zero_fcs_fds , Deprecated FPU CS/DS (stored as zero) + 7, 0, ebx, 14, mpx , Intel memory protection extensions + 7, 0, ebx, 15, rdt_a , Intel RDT / AMD Platform-QoS Enforcemeent + 7, 0, ebx, 16, avx512f , AVX-512 foundation instructions + 7, 0, ebx, 17, avx512dq , AVX-512 double/quadword instructions + 7, 0, ebx, 18, rdseed , RDSEED instruction + 7, 0, ebx, 19, adx , ADCX/ADOX instructions + 7, 0, ebx, 20, smap , Supervisor mode access prevention + 7, 0, ebx, 21, avx512ifma , AVX-512 integer fused multiply add + 7, 0, ebx, 23, clflushopt , CLFLUSHOPT instruction + 7, 0, ebx, 24, clwb , CLWB instruction + 7, 0, ebx, 25, intel_pt , Intel processor trace + 7, 0, ebx, 26, avx512pf , AVX-512 prefetch instructions + 7, 0, ebx, 27, avx512er , AVX-512 exponent/reciprocal instrs + 7, 0, ebx, 28, avx512cd , AVX-512 conflict detection instrs + 7, 0, ebx, 29, sha_ni , SHA/SHA256 instructions + 7, 0, ebx, 30, avx512bw , AVX-512 BW (byte/word granular) instructions + 7, 0, 
ebx, 31, avx512vl , AVX-512 VL (128/256 vector length) extensions + 7, 0, ecx, 0, prefetchwt1 , PREFETCHWT1 (Intel Xeon Phi only) + 7, 0, ecx, 1, avx512vbmi , AVX-512 Vector byte manipulation instrs + 7, 0, ecx, 2, umip , User mode instruction protection + 7, 0, ecx, 3, pku , Protection keys for user-space + 7, 0, ecx, 4, ospke , OS protection keys enable + 7, 0, ecx, 5, waitpkg , WAITPKG instructions + 7, 0, ecx, 6, avx512_vbmi2 , AVX-512 vector byte manipulation instrs group 2 + 7, 0, ecx, 7, cet_ss , CET shadow stack features + 7, 0, ecx, 8, gfni , Galois field new instructions + 7, 0, ecx, 9, vaes , Vector AES instrs + 7, 0, ecx, 10, vpclmulqdq , VPCLMULQDQ 256-bit instruction support + 7, 0, ecx, 11, avx512_vnni , Vector neural network instructions + 7, 0, ecx, 12, avx512_bitalg , AVX-512 bit count/shiffle + 7, 0, ecx, 13, tme , Intel total memory encryption + 7, 0, ecx, 14, avx512_vpopcntdq , AVX-512: POPCNT for vectors of DW/QW + 7, 0, ecx, 16, la57 , 57-bit linear addreses (five-level paging) + 7, 0, ecx, 21:17, mawau_val_lm , BNDLDX/BNDSTX MAWAU value in 64-bit mode + 7, 0, ecx, 22, rdpid , RDPID instruction + 7, 0, ecx, 23, key_locker , Intel key locker support + 7, 0, ecx, 24, bus_lock_detect , OS bus-lock detection + 7, 0, ecx, 25, cldemote , CLDEMOTE instruction + 7, 0, ecx, 27, movdiri , MOVDIRI instruction + 7, 0, ecx, 28, movdir64b , MOVDIR64B instruction + 7, 0, ecx, 29, enqcmd , Enqueue stores supported (ENQCMD{,S}) + 7, 0, ecx, 30, sgx_lc , Intel SGX launch configuration + 7, 0, ecx, 31, pks , Protection keys for supervisor-mode pages + 7, 0, edx, 1, sgx_keys , Intel SGX attestation services + 7, 0, edx, 2, avx512_4vnniw , AVX-512 neural network instructions + 7, 0, edx, 3, avx512_4fmaps , AVX-512 multiply accumulation single precision + 7, 0, edx, 4, fsrm , Fast short REP MOV + 7, 0, edx, 5, uintr , CPU supports user interrupts + 7, 0, edx, 8, avx512_vp2intersect , VP2INTERSECT{D,Q} instructions + 7, 0, edx, 9, srdbs_ctrl , SRBDS mitigation MSR available + 7, 0, edx, 10, md_clear , VERW MD_CLEAR microcode support + 7, 0, edx, 11, rtm_always_abort , XBEGIN (RTM transaction) always aborts + 7, 0, edx, 13, tsx_force_abort , MSR TSX_FORCE_ABORT, RTM_ABORT bit, supported + 7, 0, edx, 14, serialize , SERIALIZE instruction + 7, 0, edx, 15, hybrid_cpu , The CPU is identified as a 'hybrid part' + 7, 0, edx, 16, tsxldtrk , TSX suspend/resume load address tracking + 7, 0, edx, 18, pconfig , PCONFIG instruction + 7, 0, edx, 19, arch_lbr , Intel architectural LBRs + 7, 0, edx, 20, ibt , CET indirect branch tracking + 7, 0, edx, 22, amx_bf16 , AMX-BF16: tile bfloat16 support + 7, 0, edx, 23, avx512_fp16 , AVX-512 FP16 instructions + 7, 0, edx, 24, amx_tile , AMX-TILE: tile architecture support + 7, 0, edx, 25, amx_int8 , AMX-INT8: tile 8-bit integer support + 7, 0, edx, 26, spec_ctrl , Speculation Control (IBRS/IBPB: indirect branch restrictions) + 7, 0, edx, 27, intel_stibp , Single thread indirect branch predictors + 7, 0, edx, 28, flush_l1d , FLUSH L1D cache: IA32_FLUSH_CMD MSR + 7, 0, edx, 29, arch_capabilities , Intel IA32_ARCH_CAPABILITIES MSR + 7, 0, edx, 30, core_capabilities , IA32_CORE_CAPABILITIES MSR + 7, 0, edx, 31, spec_ctrl_ssbd , Speculative store bypass disable + 7, 1, eax, 4, avx_vnni , AVX-VNNI instructions + 7, 1, eax, 5, avx512_bf16 , AVX-512 bFloat16 instructions + 7, 1, eax, 6, lass , Linear address space separation + 7, 1, eax, 7, cmpccxadd , CMPccXADD instructions + 7, 1, eax, 8, arch_perfmon_ext , ArchPerfmonExt: CPUID leaf 0x23 is supported + 7, 1, eax, 10, 
fzrm , Fast zero-length REP MOVSB + 7, 1, eax, 11, fsrs , Fast short REP STOSB + 7, 1, eax, 12, fsrc , Fast Short REP CMPSB/SCASB + 7, 1, eax, 17, fred , FRED: Flexible return and event delivery transitions + 7, 1, eax, 18, lkgs , LKGS: Load 'kernel' (userspace) GS + 7, 1, eax, 19, wrmsrns , WRMSRNS instr (WRMSR-non-serializing) + 7, 1, eax, 21, amx_fp16 , AMX-FP16: FP16 tile operations + 7, 1, eax, 22, hreset , History reset support + 7, 1, eax, 23, avx_ifma , Integer fused multiply add + 7, 1, eax, 26, lam , Linear address masking + 7, 1, eax, 27, rd_wr_msrlist , RDMSRLIST/WRMSRLIST instructions + 7, 1, ebx, 0, intel_ppin , Protected processor inventory number (PPIN{,_CTL} MSRs) + 7, 1, edx, 4, avx_vnni_int8 , AVX-VNNI-INT8 instructions + 7, 1, edx, 5, avx_ne_convert , AVX-NE-CONVERT instructions + 7, 1, edx, 8, amx_complex , AMX-COMPLEX instructions (starting from Granite Rapids) + 7, 1, edx, 14, prefetchit_0_1 , PREFETCHIT0/1 instructions + 7, 1, edx, 18, cet_sss , CET supervisor shadow stacks safe to use + 7, 2, edx, 0, intel_psfd , Intel predictive store forward disable + 7, 2, edx, 1, ipred_ctrl , MSR bits IA32_SPEC_CTRL.IPRED_DIS_{U,S} + 7, 2, edx, 2, rrsba_ctrl , MSR bits IA32_SPEC_CTRL.RRSBA_DIS_{U,S} + 7, 2, edx, 3, ddp_ctrl , MSR bit IA32_SPEC_CTRL.DDPD_U + 7, 2, edx, 4, bhi_ctrl , MSR bit IA32_SPEC_CTRL.BHI_DIS_S + 7, 2, edx, 5, mcdt_no , MCDT mitigation not needed + 7, 2, edx, 6, uclock_disable , UC-lock disable is supported + +# Leaf 9H +# Intel DCA (Direct Cache Access) enumeration + + 9, 0, eax, 0, dca_enabled_in_bios , DCA is enabled in BIOS + +# Leaf AH +# Intel PMU (Performance Monitoring Unit) enumeration + + 0xa, 0, eax, 7:0, pmu_version , Performance monitoring unit version ID + 0xa, 0, eax, 15:8, pmu_n_gcounters , Number of general PMU counters per logical CPU + 0xa, 0, eax, 23:16, pmu_gcounters_nbits , Bitwidth of PMU general counters + 0xa, 0, eax, 31:24, pmu_cpuid_ebx_bits , Length of cpuid leaf 0xa EBX bit vector + 0xa, 0, ebx, 0, no_core_cycle_evt , Core cycle event not available + 0xa, 0, ebx, 1, no_insn_retired_evt , Instruction retired event not available + 0xa, 0, ebx, 2, no_refcycle_evt , Reference cycles event not available + 0xa, 0, ebx, 3, no_llc_ref_evt , LLC-reference event not available + 0xa, 0, ebx, 4, no_llc_miss_evt , LLC-misses event not available + 0xa, 0, ebx, 5, no_br_insn_ret_evt , Branch instruction retired event not available + 0xa, 0, ebx, 6, no_br_mispredict_evt , Branch mispredict retired event not available + 0xa, 0, ebx, 7, no_td_slots_evt , Topdown slots event not available + 0xa, 0, ecx, 31:0, pmu_fcounters_bitmap , Fixed-function PMU counters support bitmap + 0xa, 0, edx, 4:0, pmu_n_fcounters , Number of fixed PMU counters + 0xa, 0, edx, 12:5, pmu_fcounters_nbits , Bitwidth of PMU fixed counters + 0xa, 0, edx, 15, anythread_depr , AnyThread deprecation + +# Leaf BH +# CPUs v1 extended topology enumeration + + 0xb, 1:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive) + 0xb, 1:0, ebx, 15:0, domain_lcpus_count , Logical CPUs count across all instances of this domain + 0xb, 1:0, ecx, 7:0, domain_nr , This domain level (subleaf ID) + 0xb, 1:0, ecx, 15:8, domain_type , This domain type + 0xb, 1:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU + +# Leaf DH +# Processor extended state enumeration + + 0xd, 0, eax, 0, xcr0_x87 , XCR0.X87 (bit 0) supported + 0xd, 0, eax, 1, xcr0_sse , XCR0.SEE (bit 1) supported + 0xd, 0, eax, 2, xcr0_avx , XCR0.AVX (bit 2) supported + 0xd, 0, eax, 3, xcr0_mpx_bndregs 
, XCR0.BNDREGS (bit 3) supported (MPX BND0-BND3 regs) + 0xd, 0, eax, 4, xcr0_mpx_bndcsr , XCR0.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS regs) + 0xd, 0, eax, 5, xcr0_avx512_opmask , XCR0.OPMASK (bit 5) supported (AVX-512 k0-k7 regs) + 0xd, 0, eax, 6, xcr0_avx512_zmm_hi256 , XCR0.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 regs) + 0xd, 0, eax, 7, xcr0_avx512_hi16_zmm , XCR0.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 regs) + 0xd, 0, eax, 9, xcr0_pkru , XCR0.PKRU (bit 9) supported (XSAVE PKRU reg) + 0xd, 0, eax, 11, xcr0_cet_u , AMD XCR0.CET_U (bit 11) supported (CET supervisor state) + 0xd, 0, eax, 12, xcr0_cet_s , AMD XCR0.CET_S (bit 12) support (CET user state) + 0xd, 0, eax, 17, xcr0_tileconfig , XCR0.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG) + 0xd, 0, eax, 18, xcr0_tiledata , XCR0.TILEDATA (bit 18) supported (AMX can manage TILEDATA) + 0xd, 0, ebx, 31:0, xsave_sz_xcr0_enabled , XSAVE/XRSTR area byte size, for XCR0 enabled features + 0xd, 0, ecx, 31:0, xsave_sz_max , XSAVE/XRSTR area max byte size, all CPU features + 0xd, 0, edx, 30, xcr0_lwp , AMD XCR0.LWP (bit 62) supported (Light-weight Profiling) + 0xd, 1, eax, 0, xsaveopt , XSAVEOPT instruction + 0xd, 1, eax, 1, xsavec , XSAVEC instruction + 0xd, 1, eax, 2, xgetbv1 , XGETBV instruction with ECX = 1 + 0xd, 1, eax, 3, xsaves , XSAVES/XRSTORS instructions (and XSS MSR) + 0xd, 1, eax, 4, xfd , Extended feature disable support + 0xd, 1, ebx, 31:0, xsave_sz_xcr0_xmms_enabled, XSAVE area size, all XCR0 and XMMS features enabled + 0xd, 1, ecx, 8, xss_pt , PT state, supported + 0xd, 1, ecx, 10, xss_pasid , PASID state, supported + 0xd, 1, ecx, 11, xss_cet_u , CET user state, supported + 0xd, 1, ecx, 12, xss_cet_p , CET supervisor state, supported + 0xd, 1, ecx, 13, xss_hdc , HDC state, supported + 0xd, 1, ecx, 14, xss_uintr , UINTR state, supported + 0xd, 1, ecx, 15, xss_lbr , LBR state, supported + 0xd, 1, ecx, 16, xss_hwp , HWP state, supported + 0xd, 63:2, eax, 31:0, xsave_sz , Size of save area for subleaf-N feature, in bytes + 0xd, 63:2, ebx, 31:0, xsave_offset , Offset of save area for subleaf-N feature, in bytes + 0xd, 63:2, ecx, 0, is_xss_bit , Subleaf N describes an XSS bit, otherwise XCR0 bit + 0xd, 63:2, ecx, 1, compacted_xsave_64byte_aligned, When compacted, subleaf-N feature xsave area is 64-byte aligned + +# Leaf FH +# Intel RDT / AMD PQoS resource monitoring + + 0xf, 0, ebx, 31:0, core_rmid_max , RMID max, within this core, all types (0-based) + 0xf, 0, edx, 1, cqm_llc , LLC QoS-monitoring supported + 0xf, 1, eax, 7:0, l3c_qm_bitwidth , L3 QoS-monitoring counter bitwidth (24-based) + 0xf, 1, eax, 8, l3c_qm_overflow_bit , QM_CTR MSR bit 61 is an overflow bit + 0xf, 1, ebx, 31:0, l3c_qm_conver_factor , QM_CTR MSR conversion factor to bytes + 0xf, 1, ecx, 31:0, l3c_qm_rmid_max , L3 QoS-monitoring max RMID + 0xf, 1, edx, 0, cqm_occup_llc , L3 QoS occupancy monitoring supported + 0xf, 1, edx, 1, cqm_mbm_total , L3 QoS total bandwidth monitoring supported + 0xf, 1, edx, 2, cqm_mbm_local , L3 QoS local bandwidth monitoring supported # Leaf 10H -# Intel RDT Allocation - - 0x10, 0, EBX, 1, l3c_rdt_alloc, L3 Cache Allocation supported - 0x10, 0, EBX, 2, l2c_rdt_alloc, L2 Cache Allocation supported - 0x10, 0, EBX, 3, mem_bw_alloc, Memory Bandwidth Allocation supported - +# Intel RDT / AMD PQoS allocation enumeration + + 0x10, 0, ebx, 1, cat_l3 , L3 Cache Allocation Technology supported + 0x10, 0, ebx, 2, cat_l2 , L2 Cache Allocation Technology supported + 0x10, 0, ebx, 3, mba , Memory Bandwidth 
Allocation supported
+	0x10, 2:1, eax, 4:0, cat_cbm_len , L3/L2_CAT capacity bitmask length, minus-one notation
+	0x10, 2:1, ebx, 31:0, cat_units_bitmap , L3/L2_CAT bitmap of allocation units
+	0x10, 2:1, ecx, 1, l3_cat_cos_infreq_updates, L3_CAT COS updates should be infrequent
+	0x10, 2:1, ecx, 2, cdp_l3 , L3/L2_CAT CDP (Code and Data Prioritization)
+	0x10, 2:1, ecx, 3, cat_sparse_1s , L3/L2_CAT non-contiguous 1s value supported
+	0x10, 2:1, edx, 15:0, cat_cos_max , L3/L2_CAT max COS (Class of Service) supported
+	0x10, 3, eax, 11:0, mba_max_delay , Max MBA throttling value; minus-one notation
+	0x10, 3, ecx, 0, per_thread_mba , Per-thread MBA controls are supported
+	0x10, 3, ecx, 2, mba_delay_linear , Delay values are linear
+	0x10, 3, edx, 15:0, mba_cos_max , MBA max Class of Service supported

# Leaf 12H
-# SGX Capability
-#
-# Some detailed SGX features not added yet
-
-	0x12, 0, EAX, 0, sgx1, L3 Cache Allocation supported
-	0x12, 1, EAX, 0, sgx2, L3 Cache Allocation supported
-
+# Intel Software Guard Extensions (SGX) enumeration
+
+	0x12, 0, eax, 0, sgx1 , SGX1 leaf functions supported
+	0x12, 0, eax, 1, sgx2 , SGX2 leaf functions supported
+	0x12, 0, eax, 5, enclv_leaves , ENCLV leaves (E{INC,DEC}VIRTCHILD, ESETCONTEXT) supported
+	0x12, 0, eax, 6, encls_leaves , ENCLS leaves (ENCLS ETRACKC, ERDINFO, ELDBC, ELDUC) supported
+	0x12, 0, eax, 7, enclu_everifyreport2 , ENCLU leaf EVERIFYREPORT2 supported
+	0x12, 0, eax, 10, encls_eupdatesvn , ENCLS leaf EUPDATESVN supported
+	0x12, 0, eax, 11, sgx_edeccssa , ENCLU leaf EDECCSSA supported
+	0x12, 0, ebx, 0, miscselect_exinfo , SSA.MISC frame: reporting #PF and #GP exceptions inside enclave supported
+	0x12, 0, ebx, 1, miscselect_cpinfo , SSA.MISC frame: reporting #CP exceptions inside enclave supported
+	0x12, 0, edx, 7:0, max_enclave_sz_not64 , Maximum enclave size in non-64-bit mode (log2)
+	0x12, 0, edx, 15:8, max_enclave_sz_64 , Maximum enclave size in 64-bit mode (log2)
+	0x12, 1, eax, 0, secs_attr_init , ATTRIBUTES.INIT supported (enclave initialized by EINIT)
+	0x12, 1, eax, 1, secs_attr_debug , ATTRIBUTES.DEBUG supported (enclave permits debugger read/write)
+	0x12, 1, eax, 2, secs_attr_mode64bit , ATTRIBUTES.MODE64BIT supported (enclave runs in 64-bit mode)
+	0x12, 1, eax, 4, secs_attr_provisionkey , ATTRIBUTES.PROVISIONKEY supported (provisioning key available)
+	0x12, 1, eax, 5, secs_attr_einittoken_key, ATTRIBUTES.EINITTOKEN_KEY supported (EINIT token key available)
+	0x12, 1, eax, 6, secs_attr_cet , ATTRIBUTES.CET supported (enable CET attributes)
+	0x12, 1, eax, 7, secs_attr_kss , ATTRIBUTES.KSS supported (Key Separation and Sharing enabled)
+	0x12, 1, eax, 10, secs_attr_aexnotify , ATTRIBUTES.AEXNOTIFY supported (enclave threads may get AEX notifications)
+	0x12, 1, ecx, 0, xfrm_x87 , Enclave XFRM.X87 (bit 0) supported
+	0x12, 1, ecx, 1, xfrm_sse , Enclave XFRM.SSE (bit 1) supported
+	0x12, 1, ecx, 2, xfrm_avx , Enclave XFRM.AVX (bit 2) supported
+	0x12, 1, ecx, 3, xfrm_mpx_bndregs , Enclave XFRM.BNDREGS (bit 3) supported (MPX BND0-BND3 regs)
+	0x12, 1, ecx, 4, xfrm_mpx_bndcsr , Enclave XFRM.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS regs)
+	0x12, 1, ecx, 5, xfrm_avx512_opmask , Enclave XFRM.OPMASK (bit 5) supported (AVX-512 k0-k7 regs)
+	0x12, 1, ecx, 6, xfrm_avx512_zmm_hi256 , Enclave XFRM.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 regs)
+	0x12, 1, ecx, 7, xfrm_avx512_hi16_zmm , Enclave XFRM.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 regs)
+	0x12, 1, ecx, 9, xfrm_pkru , Enclave XFRM.PKRU (bit 9) supported (XSAVE PKRU reg)
+	0x12, 1, ecx, 17, xfrm_tileconfig , Enclave XFRM.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG)
+	0x12, 1, ecx, 18, xfrm_tiledata , Enclave XFRM.TILEDATA (bit 18) supported (AMX can manage TILEDATA)
+	0x12, 31:2, eax, 3:0, subleaf_type , Subleaf type (dictates output layout)
+	0x12, 31:2, eax, 31:12, epc_sec_base_addr_0 , EPC section base addr, bits[12:31]
+	0x12, 31:2, ebx, 19:0, epc_sec_base_addr_1 , EPC section base addr, bits[32:51]
+	0x12, 31:2, ecx, 3:0, epc_sec_type , EPC section type / property encoding
+	0x12, 31:2, ecx, 31:12, epc_sec_size_0 , EPC section size, bits[12:31]
+	0x12, 31:2, edx, 19:0, epc_sec_size_1 , EPC section size, bits[32:51]

# Leaf 14H
-# Intel Processor Tracer
-#
+# Intel Processor Trace enumeration
+
+	0x14, 0, eax, 31:0, pt_max_subleaf , Max cpuid 0x14 subleaf
+	0x14, 0, ebx, 0, cr3_filtering , IA32_RTIT_CR3_MATCH is accessible
+	0x14, 0, ebx, 1, psb_cyc , Configurable PSB and cycle-accurate mode
+	0x14, 0, ebx, 2, ip_filtering , IP/TraceStop filtering; Warm-reset PT MSRs preservation
+	0x14, 0, ebx, 3, mtc_timing , MTC timing packet; COFI-based packets suppression
+	0x14, 0, ebx, 4, ptwrite , PTWRITE support
+	0x14, 0, ebx, 5, power_event_trace , Power Event Trace support
+	0x14, 0, ebx, 6, psb_pmi_preserve , PSB and PMI preservation support
+	0x14, 0, ebx, 7, event_trace , Event Trace packet generation through IA32_RTIT_CTL.EventEn
+	0x14, 0, ebx, 8, tnt_disable , TNT packet generation disable through IA32_RTIT_CTL.DisTNT
+	0x14, 0, ecx, 0, topa_output , ToPA output scheme support
+	0x14, 0, ecx, 1, topa_multiple_entries , ToPA tables can hold multiple entries
+	0x14, 0, ecx, 2, single_range_output , Single-range output scheme supported
+	0x14, 0, ecx, 3, trace_transport_output, Trace Transport subsystem output support
+	0x14, 0, ecx, 31, ip_payloads_lip , IP payloads have LIP values (CS base included)
+	0x14, 1, eax, 2:0, num_address_ranges , Filtering number of configurable Address Ranges
+	0x14, 1, eax, 31:16, mtc_periods_bmp , Bitmap of supported MTC period encodings
+	0x14, 1, ebx, 15:0, cycle_thresholds_bmp , Bitmap of supported Cycle Threshold encodings
+	0x14, 1, ebx, 31:16, psb_periods_bmp , Bitmap of supported Configurable PSB frequency encodings

# Leaf 15H
-# Time Stamp Counter and Nominal Core Crystal Clock Information
+# Intel TSC (Time Stamp Counter) enumeration

-	0x15, 0, EAX, 31:0, tsc_denominator, The denominator of the TSC/”core crystal clock” ratio
-	0x15, 0, EBX, 31:0, tsc_numerator, The numerator of the TSC/”core crystal clock” ratio
-	0x15, 0, ECX, 31:0, nom_freq, Nominal frequency of the core crystal clock in Hz
+	0x15, 0, eax, 31:0, tsc_denominator , Denominator of the TSC/'core crystal clock' ratio
+	0x15, 0, ebx, 31:0, tsc_numerator , Numerator of the TSC/'core crystal clock' ratio
+	0x15, 0, ecx, 31:0, cpu_crystal_hz , Core crystal clock nominal frequency, in Hz

# Leaf 16H
-# Processor Frequency Information
+# Intel processor frequency enumeration

-	0x16, 0, EAX, 15:0, cpu_base_freq, Processor Base Frequency in MHz
-	0x16, 0, EBX, 15:0, cpu_max_freq, Maximum Frequency in MHz
-	0x16, 0, ECX, 15:0, bus_freq, Bus (Reference) Frequency in MHz
+	0x16, 0, eax, 15:0, cpu_base_mhz , Processor base frequency, in MHz
+	0x16, 0, ebx, 15:0, cpu_max_mhz , Processor max frequency, in MHz
+	0x16, 0, ecx, 15:0, bus_mhz , Bus reference frequency, in MHz

# Leaf 17H
-# System-On-Chip Vendor Attribute
-
-	0x17, 0, EAX, 31:0, max_socid, Maximum input value of supported sub-leaf
-	0x17, 0, EBX, 15:0, soc_vid, SOC Vendor ID
-	0x17, 0, EBX, 16, std_vid, SOC Vendor ID is assigned via an industry standard scheme
-	0x17, 0, ECX, 31:0, soc_pid, SOC Project ID assigned by vendor
-	0x17, 0, EDX, 31:0, soc_sid, SOC Stepping ID
+# Intel SoC vendor attributes enumeration
+
+	0x17, 0, eax, 31:0, soc_max_subleaf , Max cpuid leaf 0x17 subleaf
+	0x17, 0, ebx, 15:0, soc_vendor_id , SoC vendor ID
+	0x17, 0, ebx, 16, is_vendor_scheme , Assigned by industry enumeration scheme (not Intel)
+	0x17, 0, ecx, 31:0, soc_proj_id , SoC project ID, assigned by vendor
+	0x17, 0, edx, 31:0, soc_stepping_id , SoC project stepping ID, assigned by vendor
+	0x17, 3:1, eax, 31:0, vendor_brand_a , Vendor Brand ID string, bytes subleaf_nr * (0 -> 3)
+	0x17, 3:1, ebx, 31:0, vendor_brand_b , Vendor Brand ID string, bytes subleaf_nr * (4 -> 7)
+	0x17, 3:1, ecx, 31:0, vendor_brand_c , Vendor Brand ID string, bytes subleaf_nr * (8 -> 11)
+	0x17, 3:1, edx, 31:0, vendor_brand_d , Vendor Brand ID string, bytes subleaf_nr * (12 -> 15)

# Leaf 18H
-# Deterministic Address Translation Parameters
-
+# Intel deterministic address translation (TLB) parameters
+
+	0x18, 31:0, eax, 31:0, tlb_max_subleaf , Max cpuid 0x18 subleaf
+	0x18, 31:0, ebx, 0, tlb_4k_page , TLB 4KB-page entries supported
+	0x18, 31:0, ebx, 1, tlb_2m_page , TLB 2MB-page entries supported
+	0x18, 31:0, ebx, 2, tlb_4m_page , TLB 4MB-page entries supported
+	0x18, 31:0, ebx, 3, tlb_1g_page , TLB 1GB-page entries supported
+	0x18, 31:0, ebx, 10:8, hard_partitioning , (Hard/Soft) partitioning between logical CPUs sharing this struct
+	0x18, 31:0, ebx, 31:16, n_way_associative , Ways of associativity
+	0x18, 31:0, ecx, 31:0, n_sets , Number of sets
+	0x18, 31:0, edx, 4:0, tlb_type , Translation cache type (TLB type)
+	0x18, 31:0, edx, 7:5, tlb_cache_level , Translation cache level (1-based)
+	0x18, 31:0, edx, 8, is_fully_associative , Fully-associative structure
+	0x18, 31:0, edx, 25:14, tlb_max_addressible_ids, Max number of addressable IDs for logical CPUs sharing this TLB - 1

# Leaf 19H
-# Key Locker Leaf
+# Intel Key Locker enumeration
+	0x19, 0, eax, 0, kl_cpl0_only , CPL0-only Key Locker restriction supported
+	0x19, 0, eax, 1, kl_no_encrypt , No-encrypt Key Locker restriction supported
+	0x19, 0, eax, 2, kl_no_decrypt , No-decrypt Key Locker restriction supported
+	0x19, 0, ebx, 0, aes_keylocker , AES Key Locker instructions supported
+	0x19, 0, ebx, 2, aes_keylocker_wide , AES wide Key Locker instructions supported
+	0x19, 0, ebx, 4, kl_msr_iwkey , Key Locker MSRs and IWKEY backups supported
+	0x19, 0, ecx, 0, loadiwkey_no_backup , LOADIWKEY NoBackup parameter supported
+	0x19, 0, ecx, 1, iwkey_rand , IWKEY randomization (KeySource encoding 1) supported

# Leaf 1AH
-# Hybrid Information
-
-	0x1A, 0, EAX, 31:24, core_type, 20H-Intel_Atom 40H-Intel_Core
-
+# Intel hybrid CPUs identification (e.g. Atom, Core)
+
+	0x1a, 0, eax, 23:0, core_native_model , This core's native model ID
+	0x1a, 0, eax, 31:24, core_type , This core's type
+
+# Leaf 1BH
+# Intel PCONFIG (Platform configuration) enumeration
+
+	0x1b, 31:0, eax, 11:0, pconfig_subleaf_type , CPUID 0x1b subleaf type
+	0x1b, 31:0, ebx, 31:0, pconfig_target_id_x , A supported PCONFIG target ID
+	0x1b, 31:0, ecx, 31:0, pconfig_target_id_y , A supported PCONFIG target ID
+	0x1b, 31:0, edx, 31:0, pconfig_target_id_z , A supported PCONFIG target ID
+
+# Leaf 1CH
+# Intel LBR (Last Branch Record) enumeration
+
+	0x1c, 0, eax, 0, lbr_depth_8 , Max stack depth (number of LBR entries) = 8
+	0x1c, 0, eax, 1, lbr_depth_16 , Max stack depth (number of LBR entries) = 16
+	0x1c, 0, eax, 2, lbr_depth_24 , Max stack depth (number of LBR entries) = 24
+	0x1c, 0, eax, 3, lbr_depth_32 , Max stack depth (number of LBR entries) = 32
+	0x1c, 0, eax, 4, lbr_depth_40 , Max stack depth (number of LBR entries) = 40
+	0x1c, 0, eax, 5, lbr_depth_48 , Max stack depth (number of LBR entries) = 48
+	0x1c, 0, eax, 6, lbr_depth_56 , Max stack depth (number of LBR entries) = 56
+	0x1c, 0, eax, 7, lbr_depth_64 , Max stack depth (number of LBR entries) = 64
+	0x1c, 0, eax, 30, lbr_deep_c_reset , LBRs may be cleared on MWAIT C-state > C1
+	0x1c, 0, eax, 31, lbr_ip_is_lip , LBR IP contains last IP, otherwise effective IP
+	0x1c, 0, ebx, 0, lbr_cpl , CPL filtering (non-zero IA32_LBR_CTL[2:1]) supported
+	0x1c, 0, ebx, 1, lbr_branch_filter , Branch filtering (non-zero IA32_LBR_CTL[22:16]) supported
+	0x1c, 0, ebx, 2, lbr_call_stack , Call-stack mode (IA32_LBR_CTL[3] = 1) supported
+	0x1c, 0, ecx, 0, lbr_mispredict , Branch misprediction bit supported (IA32_LBR_x_INFO[63])
+	0x1c, 0, ecx, 1, lbr_timed_lbr , Timed LBRs (CPU cycles since last LBR entry) supported
+	0x1c, 0, ecx, 2, lbr_branch_type , Branch type field (IA32_LBR_INFO_x[59:56]) supported
+	0x1c, 0, ecx, 19:16, lbr_events_gpc_bmp , LBR PMU-events logging support; bitmap for first 4 GP (general-purpose) counters
+
+# Leaf 1DH
+# Intel AMX (Advanced Matrix Extensions) tile information
+
+	0x1d, 0, eax, 31:0, amx_max_palette , Highest palette ID / subleaf ID
+	0x1d, 1, eax, 15:0, amx_palette_size , AMX palette total tiles size, in bytes
+	0x1d, 1, eax, 31:16, amx_tile_size , AMX single tile's size, in bytes
+	0x1d, 1, ebx, 15:0, amx_tile_row_size , AMX tile single row's size, in bytes
+	0x1d, 1, ebx, 31:16, amx_palette_nr_tiles , AMX palette number of tiles
+	0x1d, 1, ecx, 15:0, amx_tile_nr_rows , AMX tile max number of rows
+
+# Leaf 1EH
+# Intel AMX, TMUL (Tile-matrix MULtiply) accelerator unit enumeration
+
+	0x1e, 0, ebx, 7:0, tmul_maxk , TMUL unit maximum height, K (rows or columns)
+	0x1e, 0, ebx, 23:8, tmul_maxn , TMUL unit maximum SIMD dimension, N (column bytes)

# Leaf 1FH
-# V2 Extended Topology - A preferred superset to leaf 0BH
-
-
-# According to SDM
-# 40000000H - 4FFFFFFFH is invalid range
+# Intel extended topology enumeration v2
+
+	0x1f, 5:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive)
+	0x1f, 5:0, ebx, 15:0, domain_lcpus_count , Logical CPUs count across all instances of this domain
+	0x1f, 5:0, ecx, 7:0, domain_level , This domain level (subleaf ID)
+	0x1f, 5:0, ecx, 15:8, domain_type , This domain type
+	0x1f, 5:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU
+
+# Leaf 20H
+# Intel HRESET (History Reset) enumeration
+
+	0x20, 0, eax, 31:0, hreset_nr_subleaves , CPUID 0x20 max subleaf + 1
+	0x20, 0, ebx, 0, hreset_thread_director , HRESET of Intel thread director is supported
+
+# Leaf 21H
+# Intel TD (Trust Domain) guest execution environment enumeration
+
+	0x21, 0, ebx, 31:0, tdx_vendorid_0 , TDX vendor ID string bytes 0 - 3
+	0x21, 0, ecx, 31:0, tdx_vendorid_2 , TDX vendor ID string bytes 8 - 11
+	0x21, 0, edx, 31:0, tdx_vendorid_1 , TDX vendor ID string bytes 4 - 7
+
+# Leaf 23H
+# Intel Architectural Performance Monitoring Extended (ArchPerfmonExt)
+
+	0x23, 0, eax, 1, subleaf_1_counters , Subleaf 1, PMU counters bitmaps, is valid
+	0x23, 0, eax, 3, subleaf_3_events , Subleaf 3, PMU events bitmaps, is valid
+	0x23, 0, ebx, 0, unitmask2 , IA32_PERFEVTSELx MSRs UnitMask2 is supported
+	0x23, 0, ebx, 1, zbit , IA32_PERFEVTSELx MSRs Z-bit is supported
+	0x23, 1, eax, 31:0, pmu_gp_counters_bitmap , General-purpose PMU counters bitmap
+	0x23, 1, ebx, 31:0, pmu_f_counters_bitmap , Fixed PMU counters bitmap
+	0x23, 3, eax, 0, core_cycles_evt , Core cycles event supported
+	0x23, 3, eax, 1, insn_retired_evt , Instructions retired event supported
+	0x23, 3, eax, 2, ref_cycles_evt , Reference cycles event supported
+	0x23, 3, eax, 3, llc_refs_evt , Last-level cache references event supported
+	0x23, 3, eax, 4, llc_misses_evt , Last-level cache misses event supported
+	0x23, 3, eax, 5, br_insn_ret_evt , Branch instruction retired event supported
+	0x23, 3, eax, 6, br_mispr_evt , Branch mispredict retired event supported
+	0x23, 3, eax, 7, td_slots_evt , Topdown slots event supported
+	0x23, 3, eax, 8, td_backend_bound_evt , Topdown backend bound event supported
+	0x23, 3, eax, 9, td_bad_spec_evt , Topdown bad speculation event supported
+	0x23, 3, eax, 10, td_frontend_bound_evt , Topdown frontend bound event supported
+	0x23, 3, eax, 11, td_retiring_evt , Topdown retiring event supported
+
+# Leaf 40000000H
+# Maximum hypervisor standard leaf + hypervisor vendor string
+
+0x40000000, 0, eax, 31:0, max_hyp_leaf , Maximum hypervisor standard leaf number
+0x40000000, 0, ebx, 31:0, hypervisor_id_0 , Hypervisor ID string bytes 0 - 3
+0x40000000, 0, ecx, 31:0, hypervisor_id_1 , Hypervisor ID string bytes 4 - 7
+0x40000000, 0, edx, 31:0, hypervisor_id_2 , Hypervisor ID string bytes 8 - 11
+
+# Leaf 80000000H
+# Maximum extended leaf number + CPU vendor string (AMD)
+
+0x80000000, 0, eax, 31:0, max_ext_leaf , Maximum extended cpuid leaf supported
+0x80000000, 0, ebx, 31:0, cpu_vendorid_0 , Vendor ID string bytes 0 - 3
+0x80000000, 0, ecx, 31:0, cpu_vendorid_2 , Vendor ID string bytes 8 - 11
+0x80000000, 0, edx, 31:0, cpu_vendorid_1 , Vendor ID string bytes 4 - 7

# Leaf 80000001H
-# Extended Processor Signature and Feature Bits
-
-0x80000001, 0, EAX, 27:20, extfamily, Extended family
-0x80000001, 0, EAX, 19:16, extmodel, Extended model
-0x80000001, 0, EAX, 11:8, basefamily, Description of Family
-0x80000001, 0, EAX, 11:8, basemodel, Model numbers vary with product
-0x80000001, 0, EAX, 3:0, stepping, Processor stepping (revision) for a specific model
-
-0x80000001, 0, EBX, 31:28, pkgtype, Specifies the package type
-
-0x80000001, 0, ECX, 0, lahf_lm, LAHF/SAHF available in 64-bit mode
-0x80000001, 0, ECX, 1, cmplegacy, Core multi-processing legacy mode
-0x80000001, 0, ECX, 2, svm, Indicates support for: VMRUN, VMLOAD, VMSAVE, CLGI, VMMCALL, and INVLPGA
-0x80000001, 0, ECX, 3, extapicspace, Extended APIC register space
-0x80000001, 0, ECX, 4, altmovecr8, Indicates support for LOCK MOV CR0 means MOV CR8
-0x80000001, 0, ECX, 5, lzcnt, LZCNT
-0x80000001, 0, ECX, 6, sse4a, EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support
-0x80000001, 0, ECX, 7,
misalignsse, Misaligned SSE Mode -0x80000001, 0, ECX, 8, prefetchw, PREFETCHW -0x80000001, 0, ECX, 9, osvw, OS Visible Work-around support -0x80000001, 0, ECX, 10, ibs, Instruction Based Sampling -0x80000001, 0, ECX, 11, xop, Extended operation support -0x80000001, 0, ECX, 12, skinit, SKINIT and STGI support -0x80000001, 0, ECX, 13, wdt, Watchdog timer support -0x80000001, 0, ECX, 15, lwp, Lightweight profiling support -0x80000001, 0, ECX, 16, fma4, Four-operand FMA instruction support -0x80000001, 0, ECX, 17, tce, Translation cache extension -0x80000001, 0, ECX, 22, TopologyExtensions, Indicates support for Core::X86::Cpuid::CachePropEax0 and Core::X86::Cpuid::ExtApicId -0x80000001, 0, ECX, 23, perfctrextcore, Indicates support for Core::X86::Msr::PERF_CTL0 - 5 and Core::X86::Msr::PERF_CTR -0x80000001, 0, ECX, 24, perfctrextdf, Indicates support for Core::X86::Msr::DF_PERF_CTL and Core::X86::Msr::DF_PERF_CTR -0x80000001, 0, ECX, 26, databreakpointextension, Indicates data breakpoint support for Core::X86::Msr::DR0_ADDR_MASK, Core::X86::Msr::DR1_ADDR_MASK, Core::X86::Msr::DR2_ADDR_MASK and Core::X86::Msr::DR3_ADDR_MASK -0x80000001, 0, ECX, 27, perftsc, Performance time-stamp counter supported -0x80000001, 0, ECX, 28, perfctrextllc, Indicates support for L3 performance counter extensions -0x80000001, 0, ECX, 29, mwaitextended, MWAITX and MONITORX capability is supported -0x80000001, 0, ECX, 30, admskextn, Indicates support for address mask extension (to 32 bits and to all 4 DRs) for instruction breakpoints - -0x80000001, 0, EDX, 0, fpu, x87 floating point unit on-chip -0x80000001, 0, EDX, 1, vme, Virtual-mode enhancements -0x80000001, 0, EDX, 2, de, Debugging extensions, IO breakpoints, CR4.DE -0x80000001, 0, EDX, 3, pse, Page-size extensions (4 MB pages) -0x80000001, 0, EDX, 4, tsc, Time stamp counter, RDTSC/RDTSCP instructions, CR4.TSD -0x80000001, 0, EDX, 5, msr, Model-specific registers (MSRs), with RDMSR and WRMSR instructions -0x80000001, 0, EDX, 6, pae, Physical-address extensions (PAE) -0x80000001, 0, EDX, 7, mce, Machine Check Exception, CR4.MCE -0x80000001, 0, EDX, 8, cmpxchg8b, CMPXCHG8B instruction -0x80000001, 0, EDX, 9, apic, advanced programmable interrupt controller (APIC) exists and is enabled -0x80000001, 0, EDX, 11, sysret, SYSCALL/SYSRET supported -0x80000001, 0, EDX, 12, mtrr, Memory-type range registers -0x80000001, 0, EDX, 13, pge, Page global extension, CR4.PGE -0x80000001, 0, EDX, 14, mca, Machine check architecture, MCG_CAP -0x80000001, 0, EDX, 15, cmov, Conditional move instructions, CMOV, FCOMI, FCMOV -0x80000001, 0, EDX, 16, pat, Page attribute table -0x80000001, 0, EDX, 17, pse36, Page-size extensions -0x80000001, 0, EDX, 20, exec_dis, Execute Disable Bit available -0x80000001, 0, EDX, 22, mmxext, AMD extensions to MMX instructions -0x80000001, 0, EDX, 23, mmx, MMX instructions -0x80000001, 0, EDX, 24, fxsr, FXSAVE and FXRSTOR instructions -0x80000001, 0, EDX, 25, ffxsr, FXSAVE and FXRSTOR instruction optimizations -0x80000001, 0, EDX, 26, 1gb_page, 1GB page supported -0x80000001, 0, EDX, 27, rdtscp, RDTSCP and IA32_TSC_AUX are available -0x80000001, 0, EDX, 29, lm, 64b Architecture supported -0x80000001, 0, EDX, 30, threednowext, AMD extensions to 3DNow! instructions -0x80000001, 0, EDX, 31, threednow, 3DNow! 
instructions - -# Leaf 80000002H/80000003H/80000004H -# Processor Brand String +# Extended CPU feature identifiers + +0x80000001, 0, eax, 3:0, e_stepping_id , Stepping ID +0x80000001, 0, eax, 7:4, e_base_model , Base processor model +0x80000001, 0, eax, 11:8, e_base_family , Base processor family +0x80000001, 0, eax, 19:16, e_ext_model , Extended processor model +0x80000001, 0, eax, 27:20, e_ext_family , Extended processor family +0x80000001, 0, ebx, 15:0, brand_id , Brand ID +0x80000001, 0, ebx, 31:28, pkg_type , Package type +0x80000001, 0, ecx, 0, lahf_lm , LAHF and SAHF in 64-bit mode +0x80000001, 0, ecx, 1, cmp_legacy , Multi-processing legacy mode (No HT) +0x80000001, 0, ecx, 2, svm , Secure Virtual Machine +0x80000001, 0, ecx, 3, extapic , Extended APIC space +0x80000001, 0, ecx, 4, cr8_legacy , LOCK MOV CR0 means MOV CR8 +0x80000001, 0, ecx, 5, abm , LZCNT advanced bit manipulation +0x80000001, 0, ecx, 6, sse4a , SSE4A support +0x80000001, 0, ecx, 7, misalignsse , Misaligned SSE mode +0x80000001, 0, ecx, 8, 3dnowprefetch , 3DNow PREFETCH/PREFETCHW support +0x80000001, 0, ecx, 9, osvw , OS visible workaround +0x80000001, 0, ecx, 10, ibs , Instruction based sampling +0x80000001, 0, ecx, 11, xop , XOP: extended operation (AVX instructions) +0x80000001, 0, ecx, 12, skinit , SKINIT/STGI support +0x80000001, 0, ecx, 13, wdt , Watchdog timer support +0x80000001, 0, ecx, 15, lwp , Lightweight profiling +0x80000001, 0, ecx, 16, fma4 , 4-operand FMA instruction +0x80000001, 0, ecx, 17, tce , Translation cache extension +0x80000001, 0, ecx, 19, nodeid_msr , NodeId MSR (0xc001100c) +0x80000001, 0, ecx, 21, tbm , Trailing bit manipulations +0x80000001, 0, ecx, 22, topoext , Topology Extensions (cpuid leaf 0x8000001d) +0x80000001, 0, ecx, 23, perfctr_core , Core performance counter extensions +0x80000001, 0, ecx, 24, perfctr_nb , NB/DF performance counter extensions +0x80000001, 0, ecx, 26, bpext , Data access breakpoint extension +0x80000001, 0, ecx, 27, ptsc , Performance time-stamp counter +0x80000001, 0, ecx, 28, perfctr_llc , LLC (L3) performance counter extensions +0x80000001, 0, ecx, 29, mwaitx , MWAITX/MONITORX support +0x80000001, 0, ecx, 30, addr_mask_ext , Breakpoint address mask extension (to bit 31) +0x80000001, 0, edx, 0, e_fpu , Floating-Point Unit on-chip (x87) +0x80000001, 0, edx, 1, e_vme , Virtual-8086 Mode Extensions +0x80000001, 0, edx, 2, e_de , Debugging Extensions +0x80000001, 0, edx, 3, e_pse , Page Size Extension +0x80000001, 0, edx, 4, e_tsc , Time Stamp Counter +0x80000001, 0, edx, 5, e_msr , Model-Specific Registers (RDMSR and WRMSR support) +0x80000001, 0, edx, 6, pae , Physical Address Extensions +0x80000001, 0, edx, 7, mce , Machine Check Exception +0x80000001, 0, edx, 8, cx8 , CMPXCHG8B instruction +0x80000001, 0, edx, 9, apic , APIC on-chip +0x80000001, 0, edx, 11, syscall , SYSCALL and SYSRET instructions +0x80000001, 0, edx, 12, mtrr , Memory Type Range Registers +0x80000001, 0, edx, 13, pge , Page Global Extensions +0x80000001, 0, edx, 14, mca , Machine Check Architecture +0x80000001, 0, edx, 15, cmov , Conditional Move Instruction +0x80000001, 0, edx, 16, pat , Page Attribute Table +0x80000001, 0, edx, 17, pse36 , Page Size Extension (36-bit) +0x80000001, 0, edx, 19, mp , Out-of-spec AMD Multiprocessing bit +0x80000001, 0, edx, 20, nx , No-execute page protection +0x80000001, 0, edx, 22, mmxext , AMD MMX extensions +0x80000001, 0, edx, 24, e_fxsr , FXSAVE and FXRSTOR instructions +0x80000001, 0, edx, 25, fxsr_opt , FXSAVE and FXRSTOR optimizations 
+0x80000001, 0, edx, 26, pdpe1gb , 1-GB large page support
+0x80000001, 0, edx, 27, rdtscp , RDTSCP instruction
+0x80000001, 0, edx, 29, lm , Long mode (x86-64, 64-bit support)
+0x80000001, 0, edx, 30, 3dnowext , AMD 3DNow extensions
+0x80000001, 0, edx, 31, 3dnow , 3DNow instructions
+
+# Leaf 80000002H
+# CPU brand ID string, bytes 0 - 15
+
+0x80000002, 0, eax, 31:0, cpu_brandid_0 , CPU brand ID string, bytes 0 - 3
+0x80000002, 0, ebx, 31:0, cpu_brandid_1 , CPU brand ID string, bytes 4 - 7
+0x80000002, 0, ecx, 31:0, cpu_brandid_2 , CPU brand ID string, bytes 8 - 11
+0x80000002, 0, edx, 31:0, cpu_brandid_3 , CPU brand ID string, bytes 12 - 15
+
+# Leaf 80000003H
+# CPU brand ID string, bytes 16 - 31
+
+0x80000003, 0, eax, 31:0, cpu_brandid_4 , CPU brand ID string, bytes 16 - 19
+0x80000003, 0, ebx, 31:0, cpu_brandid_5 , CPU brand ID string, bytes 20 - 23
+0x80000003, 0, ecx, 31:0, cpu_brandid_6 , CPU brand ID string, bytes 24 - 27
+0x80000003, 0, edx, 31:0, cpu_brandid_7 , CPU brand ID string, bytes 28 - 31
+
+# Leaf 80000004H
+# CPU brand ID string, bytes 32 - 47
+
+0x80000004, 0, eax, 31:0, cpu_brandid_8 , CPU brand ID string, bytes 32 - 35
+0x80000004, 0, ebx, 31:0, cpu_brandid_9 , CPU brand ID string, bytes 36 - 39
+0x80000004, 0, ecx, 31:0, cpu_brandid_10 , CPU brand ID string, bytes 40 - 43
+0x80000004, 0, edx, 31:0, cpu_brandid_11 , CPU brand ID string, bytes 44 - 47

# Leaf 80000005H
-# Reserved
+# AMD L1 cache and L1 TLB enumeration
+
+0x80000005, 0, eax, 7:0, l1_itlb_2m_4m_nentries , L1 ITLB #entries, 2M and 4M pages
+0x80000005, 0, eax, 15:8, l1_itlb_2m_4m_assoc , L1 ITLB associativity, 2M and 4M pages
+0x80000005, 0, eax, 23:16, l1_dtlb_2m_4m_nentries , L1 DTLB #entries, 2M and 4M pages
+0x80000005, 0, eax, 31:24, l1_dtlb_2m_4m_assoc , L1 DTLB associativity, 2M and 4M pages
+0x80000005, 0, ebx, 7:0, l1_itlb_4k_nentries , L1 ITLB #entries, 4K pages
+0x80000005, 0, ebx, 15:8, l1_itlb_4k_assoc , L1 ITLB associativity, 4K pages
+0x80000005, 0, ebx, 23:16, l1_dtlb_4k_nentries , L1 DTLB #entries, 4K pages
+0x80000005, 0, ebx, 31:24, l1_dtlb_4k_assoc , L1 DTLB associativity, 4K pages
+0x80000005, 0, ecx, 7:0, l1_dcache_line_size , L1 dcache line size, in bytes
+0x80000005, 0, ecx, 15:8, l1_dcache_nlines , L1 dcache lines per tag
+0x80000005, 0, ecx, 23:16, l1_dcache_assoc , L1 dcache associativity
+0x80000005, 0, ecx, 31:24, l1_dcache_size_kb , L1 dcache size, in KB
+0x80000005, 0, edx, 7:0, l1_icache_line_size , L1 icache line size, in bytes
+0x80000005, 0, edx, 15:8, l1_icache_nlines , L1 icache lines per tag
+0x80000005, 0, edx, 23:16, l1_icache_assoc , L1 icache associativity
+0x80000005, 0, edx, 31:24, l1_icache_size_kb , L1 icache size, in KB

# Leaf 80000006H
-# Extended L2 Cache Features
-
-0x80000006, 0, ECX, 7:0, clsize, Cache Line size in bytes
-0x80000006, 0, ECX, 15:12, l2c_assoc, L2 Associativity
-0x80000006, 0, ECX, 31:16, csize, Cache size in 1K units
-
+# (Mostly AMD) L2 TLB, L2 cache, and L3 cache enumeration
+
+0x80000006, 0, eax, 11:0, l2_itlb_2m_4m_nentries , L2 iTLB #entries, 2M and 4M pages
+0x80000006, 0, eax, 15:12, l2_itlb_2m_4m_assoc , L2 iTLB associativity, 2M and 4M pages
+0x80000006, 0, eax, 27:16, l2_dtlb_2m_4m_nentries , L2 dTLB #entries, 2M and 4M pages
+0x80000006, 0, eax, 31:28, l2_dtlb_2m_4m_assoc , L2 dTLB associativity, 2M and 4M pages
+0x80000006, 0, ebx, 11:0, l2_itlb_4k_nentries , L2 iTLB #entries, 4K pages
+0x80000006, 0, ebx, 15:12, l2_itlb_4k_assoc , L2 iTLB associativity, 4K pages
+0x80000006, 0, ebx, 27:16, l2_dtlb_4k_nentries , L2 dTLB #entries, 4K pages
+0x80000006, 0, ebx, 31:28, l2_dtlb_4k_assoc , L2 dTLB associativity, 4K pages
+0x80000006, 0, ecx, 7:0, l2_line_size , L2 cache line size, in bytes
+0x80000006, 0, ecx, 11:8, l2_nlines , L2 cache number of lines per tag
+0x80000006, 0, ecx, 15:12, l2_assoc , L2 cache associativity
+0x80000006, 0, ecx, 31:16, l2_size_kb , L2 cache size, in KB
+0x80000006, 0, edx, 7:0, l3_line_size , L3 cache line size, in bytes
+0x80000006, 0, edx, 11:8, l3_nlines , L3 cache number of lines per tag
+0x80000006, 0, edx, 15:12, l3_assoc , L3 cache associativity
+0x80000006, 0, edx, 31:18, l3_size_range , L3 cache size range

# Leaf 80000007H
-
-0x80000007, 0, EDX, 8, nonstop_tsc, Invariant TSC available
-
+# CPU power management (mostly AMD) and AMD RAS enumeration
+
+0x80000007, 0, ebx, 0, overflow_recov , MCA overflow conditions not fatal
+0x80000007, 0, ebx, 1, succor , Software containment of UnCORRectable errors
+0x80000007, 0, ebx, 2, hw_assert , Hardware assert MSRs
+0x80000007, 0, ebx, 3, smca , Scalable MCA (MCAX MSRs)
+0x80000007, 0, ecx, 31:0, cpu_pwr_sample_ratio , CPU power sample time ratio
+0x80000007, 0, edx, 0, digital_temp , Digital temperature sensor
+0x80000007, 0, edx, 1, powernow_freq_id , PowerNOW! frequency scaling
+0x80000007, 0, edx, 2, powernow_volt_id , PowerNOW! voltage scaling
+0x80000007, 0, edx, 3, thermal_trip , THERMTRIP (Thermal Trip)
+0x80000007, 0, edx, 4, hw_thermal_control , Hardware thermal control
+0x80000007, 0, edx, 5, sw_thermal_control , Software thermal control
+0x80000007, 0, edx, 6, 100mhz_steps , 100 MHz multiplier control
+0x80000007, 0, edx, 7, hw_pstate , Hardware P-state control
+0x80000007, 0, edx, 8, constant_tsc , TSC ticks at constant rate across all P and C states
+0x80000007, 0, edx, 9, cpb , Core performance boost
+0x80000007, 0, edx, 10, eff_freq_ro , Read-only effective frequency interface
+0x80000007, 0, edx, 11, proc_feedback , Processor feedback interface (deprecated)
+0x80000007, 0, edx, 12, acc_power , Processor power reporting interface
+0x80000007, 0, edx, 13, connected_standby , CPU Connected Standby support
+0x80000007, 0, edx, 14, rapl , Runtime Average Power Limit interface

# Leaf 80000008H
-
-0x80000008, 0, EAX, 7:0, phy_adr_bits, Physical Address Bits
-0x80000008, 0, EAX, 15:8, lnr_adr_bits, Linear Address Bits
-0x80000007, 0, EBX, 9, wbnoinvd, WBNOINVD
-
-# 0x8000001E
-# EAX: Extended APIC ID
-0x8000001E, 0, EAX, 31:0, extended_apic_id, Extended APIC ID
-# EBX: Core Identifiers
-0x8000001E, 0, EBX, 7:0, core_id, Identifies the logical core ID
-0x8000001E, 0, EBX, 15:8, threads_per_core, The number of threads per core is threads_per_core + 1
-# ECX: Node Identifiers
-0x8000001E, 0, ECX, 7:0, node_id, Node ID
-0x8000001E, 0, ECX, 10:8, nodes_per_processor, Nodes per processor { 0: 1 node, else reserved }
-
-# 8000001F: AMD Secure Encryption
-0x8000001F, 0, EAX, 0, sme, Secure Memory Encryption
-0x8000001F, 0, EAX, 1, sev, Secure Encrypted Virtualization
-0x8000001F, 0, EAX, 2, vmpgflush, VM Page Flush MSR
-0x8000001F, 0, EAX, 3, seves, SEV Encrypted State
-0x8000001F, 0, EBX, 5:0, c-bit, Page table bit number used to enable memory encryption
-0x8000001F, 0, EBX, 11:6, mem_encrypt_physaddr_width, Reduction of physical address space in bits with SME enabled
-0x8000001F, 0, ECX, 31:0, num_encrypted_guests, Maximum ASID value that may be used for an SEV-enabled guest
-0x8000001F, 0, EDX, 31:0, minimum_sev_asid, Minimum ASID value that must be used for an SEV-enabled, SEV-ES-disabled guest
+# CPU capacity parameters and
extended feature flags (mostly AMD) + +0x80000008, 0, eax, 7:0, phys_addr_bits , Max physical address bits +0x80000008, 0, eax, 15:8, virt_addr_bits , Max virtual address bits +0x80000008, 0, eax, 23:16, guest_phys_addr_bits , Max nested-paging guest physical address bits +0x80000008, 0, ebx, 0, clzero , CLZERO supported +0x80000008, 0, ebx, 1, irperf , Instruction retired counter MSR +0x80000008, 0, ebx, 2, xsaveerptr , XSAVE/XRSTOR always saves/restores FPU error pointers +0x80000008, 0, ebx, 3, invlpgb , INVLPGB broadcasts a TLB invalidate to all threads +0x80000008, 0, ebx, 4, rdpru , RDPRU (Read Processor Register at User level) supported +0x80000008, 0, ebx, 6, mba , Memory Bandwidth Allocation (AMD bit) +0x80000008, 0, ebx, 8, mcommit , MCOMMIT (Memory commit) supported +0x80000008, 0, ebx, 9, wbnoinvd , WBNOINVD supported +0x80000008, 0, ebx, 12, amd_ibpb , Indirect Branch Prediction Barrier +0x80000008, 0, ebx, 13, wbinvd_int , Interruptible WBINVD/WBNOINVD +0x80000008, 0, ebx, 14, amd_ibrs , Indirect Branch Restricted Speculation +0x80000008, 0, ebx, 15, amd_stibp , Single Thread Indirect Branch Prediction mode +0x80000008, 0, ebx, 16, ibrs_always_on , IBRS always-on preferred +0x80000008, 0, ebx, 17, amd_stibp_always_on , STIBP always-on preferred +0x80000008, 0, ebx, 18, ibrs_fast , IBRS is preferred over software solution +0x80000008, 0, ebx, 19, ibrs_same_mode , IBRS provides same mode protection +0x80000008, 0, ebx, 20, no_efer_lmsle , EFER[LMSLE] bit (Long-Mode Segment Limit Enable) unsupported +0x80000008, 0, ebx, 21, tlb_flush_nested , INVLPGB RAX[5] bit can be set (nested translations) +0x80000008, 0, ebx, 23, amd_ppin , Protected Processor Inventory Number +0x80000008, 0, ebx, 24, amd_ssbd , Speculative Store Bypass Disable +0x80000008, 0, ebx, 25, virt_ssbd , virtualized SSBD (Speculative Store Bypass Disable) +0x80000008, 0, ebx, 26, amd_ssb_no , SSBD not needed (fixed in HW) +0x80000008, 0, ebx, 27, cppc , Collaborative Processor Performance Control +0x80000008, 0, ebx, 28, amd_psfd , Predictive Store Forward Disable +0x80000008, 0, ebx, 29, btc_no , CPU not affected by Branch Type Confusion +0x80000008, 0, ebx, 30, ibpb_ret , IBPB clears RSB/RAS too +0x80000008, 0, ebx, 31, brs , Branch Sampling supported +0x80000008, 0, ecx, 7:0, cpu_nthreads , Number of physical threads - 1 +0x80000008, 0, ecx, 15:12, apicid_coreid_len , Number of thread core ID bits (shift) in APIC ID +0x80000008, 0, ecx, 17:16, perf_tsc_len , Performance time-stamp counter size +0x80000008, 0, edx, 15:0, invlpgb_max_pages , INVLPGB maximum page count +0x80000008, 0, edx, 31:16, rdpru_max_reg_id , RDPRU max register ID (ECX input) + +# Leaf 8000000AH +# AMD SVM (Secure Virtual Machine) enumeration + +0x8000000a, 0, eax, 7:0, svm_version , SVM revision number +0x8000000a, 0, ebx, 31:0, svm_nasid , Number of address space identifiers (ASID) +0x8000000a, 0, edx, 0, npt , Nested paging +0x8000000a, 0, edx, 1, lbrv , LBR virtualization +0x8000000a, 0, edx, 2, svm_lock , SVM lock +0x8000000a, 0, edx, 3, nrip_save , NRIP save support on #VMEXIT +0x8000000a, 0, edx, 4, tsc_scale , MSR based TSC rate control +0x8000000a, 0, edx, 5, vmcb_clean , VMCB clean bits support +0x8000000a, 0, edx, 6, flushbyasid , Flush by ASID + Extended VMCB TLB_Control +0x8000000a, 0, edx, 7, decodeassists , Decode Assists support +0x8000000a, 0, edx, 10, pausefilter , Pause intercept filter +0x8000000a, 0, edx, 12, pfthreshold , Pause filter threshold +0x8000000a, 0, edx, 13, avic , Advanced virtual interrupt controller 
+0x8000000a, 0, edx, 15, v_vmsave_vmload , Virtual VMSAVE/VMLOAD (nested virt) +0x8000000a, 0, edx, 16, vgif , Virtualize the Global Interrupt Flag +0x8000000a, 0, edx, 17, gmet , Guest mode execution trap +0x8000000a, 0, edx, 18, x2avic , Virtual x2APIC +0x8000000a, 0, edx, 19, sss_check , Supervisor Shadow Stack restrictions +0x8000000a, 0, edx, 20, v_spec_ctrl , Virtual SPEC_CTRL +0x8000000a, 0, edx, 21, ro_gpt , Read-Only guest page table support +0x8000000a, 0, edx, 23, h_mce_override , Host MCE override +0x8000000a, 0, edx, 24, tlbsync_int , TLBSYNC intercept + INVLPGB/TLBSYNC in VMCB +0x8000000a, 0, edx, 25, vnmi , NMI virtualization +0x8000000a, 0, edx, 26, ibs_virt , IBS Virtualization +0x8000000a, 0, edx, 27, ext_lvt_off_chg , Extended LVT offset fault change +0x8000000a, 0, edx, 28, svme_addr_chk , Guest SVME addr check + +# Leaf 80000019H +# AMD TLB 1G-pages enumeration + +0x80000019, 0, eax, 11:0, l1_itlb_1g_nentries , L1 iTLB #entries, 1G pages +0x80000019, 0, eax, 15:12, l1_itlb_1g_assoc , L1 iTLB associativity, 1G pages +0x80000019, 0, eax, 27:16, l1_dtlb_1g_nentries , L1 dTLB #entries, 1G pages +0x80000019, 0, eax, 31:28, l1_dtlb_1g_assoc , L1 dTLB associativity, 1G pages +0x80000019, 0, ebx, 11:0, l2_itlb_1g_nentries , L2 iTLB #entries, 1G pages +0x80000019, 0, ebx, 15:12, l2_itlb_1g_assoc , L2 iTLB associativity, 1G pages +0x80000019, 0, ebx, 27:16, l2_dtlb_1g_nentries , L2 dTLB #entries, 1G pages +0x80000019, 0, ebx, 31:28, l2_dtlb_1g_assoc , L2 dTLB associativity, 1G pages + +# Leaf 8000001AH +# AMD instruction optimizations enumeration + +0x8000001a, 0, eax, 0, fp_128 , Internal FP/SIMD exec data path is 128-bits wide +0x8000001a, 0, eax, 1, movu_preferred , SSE: MOVU* better than MOVL*/MOVH* +0x8000001a, 0, eax, 2, fp_256 , internal FP/SSE exec data path is 256-bits wide + +# Leaf 8000001BH +# AMD IBS (Instruction-Based Sampling) enumeration + +0x8000001b, 0, eax, 0, ibs_flags_valid , IBS feature flags valid +0x8000001b, 0, eax, 1, ibs_fetch_sampling , IBS fetch sampling supported +0x8000001b, 0, eax, 2, ibs_op_sampling , IBS execution sampling supported +0x8000001b, 0, eax, 3, ibs_rdwr_op_counter , IBS read/write of op counter supported +0x8000001b, 0, eax, 4, ibs_op_count , IBS OP counting mode supported +0x8000001b, 0, eax, 5, ibs_branch_target , IBS branch target address reporting supported +0x8000001b, 0, eax, 6, ibs_op_counters_ext , IBS IbsOpCurCnt/IbsOpMaxCnt extend by 7 bits +0x8000001b, 0, eax, 7, ibs_rip_invalid_chk , IBS invalid RIP indication supported +0x8000001b, 0, eax, 8, ibs_op_branch_fuse , IBS fused branch micro-op indication supported +0x8000001b, 0, eax, 9, ibs_fetch_ctl_ext , IBS Fetch Control Extended MSR (0xc001103c) supported +0x8000001b, 0, eax, 10, ibs_op_data_4 , IBS op data 4 MSR supported +0x8000001b, 0, eax, 11, ibs_l3_miss_filter , IBS L3-miss filtering supported (Zen4+) + +# Leaf 8000001CH +# AMD LWP (Lightweight Profiling) + +0x8000001c, 0, eax, 0, os_lwp_avail , LWP is available to application programs (supported by OS) +0x8000001c, 0, eax, 1, os_lpwval , LWPVAL instruction (EventId=1) is supported by OS +0x8000001c, 0, eax, 2, os_lwp_ire , Instructions Retired Event (EventId=2) is supported by OS +0x8000001c, 0, eax, 3, os_lwp_bre , Branch Retired Event (EventId=3) is supported by OS +0x8000001c, 0, eax, 4, os_lwp_dme , DCache Miss Event (EventId=4) is supported by OS +0x8000001c, 0, eax, 5, os_lwp_cnh , CPU Clocks Not Halted event (EventId=5) is supported by OS +0x8000001c, 0, eax, 6, os_lwp_rnh , CPU Reference clocks Not Halted 
event (EventId=6) is supported by OS +0x8000001c, 0, eax, 29, os_lwp_cont , LWP sampling in continuous mode is supported by OS +0x8000001c, 0, eax, 30, os_lwp_ptsc , Performance Time Stamp Counter in event records is supported by OS +0x8000001c, 0, eax, 31, os_lwp_int , Interrupt on threshold overflow is supported by OS +0x8000001c, 0, ebx, 7:0, lwp_lwpcb_sz , LWP Control Block size, in quadwords +0x8000001c, 0, ebx, 15:8, lwp_event_sz , LWP event record size, in bytes +0x8000001c, 0, ebx, 23:16, lwp_max_events , LWP max supported EventId value (EventID 255 not included) +0x8000001c, 0, ebx, 31:24, lwp_event_offset , LWP events area offset in the LWP Control Block +0x8000001c, 0, ecx, 4:0, lwp_latency_max , Num of bits in cache latency counters (10 to 31) +0x8000001c, 0, ecx, 5, lwp_data_adddr , Cache miss events report the data address of the reference +0x8000001c, 0, ecx, 8:6, lwp_latency_rnd , Amount by which cache latency is rounded +0x8000001c, 0, ecx, 15:9, lwp_version , LWP implementation version +0x8000001c, 0, ecx, 23:16, lwp_buf_min_sz , LWP event ring buffer min size, in units of 32 event records +0x8000001c, 0, ecx, 28, lwp_branch_predict , Branches Retired events can be filtered +0x8000001c, 0, ecx, 29, lwp_ip_filtering , IP filtering (IPI, IPF, BaseIP, and LimitIP @ LWPCP) supported +0x8000001c, 0, ecx, 30, lwp_cache_levels , Cache-related events can be filtered by cache level +0x8000001c, 0, ecx, 31, lwp_cache_latency , Cache-related events can be filtered by latency +0x8000001c, 0, edx, 0, hw_lwp_avail , LWP is available in Hardware +0x8000001c, 0, edx, 1, hw_lpwval , LWPVAL instruction (EventId=1) is available in HW +0x8000001c, 0, edx, 2, hw_lwp_ire , Instructions Retired Event (EventId=2) is available in HW +0x8000001c, 0, edx, 3, hw_lwp_bre , Branch Retired Event (EventId=3) is available in HW +0x8000001c, 0, edx, 4, hw_lwp_dme , DCache Miss Event (EventId=4) is available in HW +0x8000001c, 0, edx, 5, hw_lwp_cnh , CPU Clocks Not Halted event (EventId=5) is available in HW +0x8000001c, 0, edx, 6, hw_lwp_rnh , CPU Reference clocks Not Halted event (EventId=6) is available in HW +0x8000001c, 0, edx, 29, hw_lwp_cont , LWP sampling in continuous mode is available in HW +0x8000001c, 0, edx, 30, hw_lwp_ptsc , Performance Time Stamp Counter in event records is available in HW +0x8000001c, 0, edx, 31, hw_lwp_int , Interrupt on threshold overflow is available in HW + +# Leaf 8000001DH +# AMD deterministic cache parameters + +0x8000001d, 31:0, eax, 4:0, cache_type , Cache type field +0x8000001d, 31:0, eax, 7:5, cache_level , Cache level (1-based) +0x8000001d, 31:0, eax, 8, cache_self_init , Self-initializing cache level +0x8000001d, 31:0, eax, 9, fully_associative , Fully-associative cache +0x8000001d, 31:0, eax, 25:14, num_threads_sharing , Number of logical CPUs sharing cache +0x8000001d, 31:0, ebx, 11:0, cache_linesize , System coherency line size (0-based) +0x8000001d, 31:0, ebx, 21:12, cache_npartitions , Physical line partitions (0-based) +0x8000001d, 31:0, ebx, 31:22, cache_nways , Ways of associativity (0-based) +0x8000001d, 31:0, ecx, 30:0, cache_nsets , Cache number of sets (0-based) +0x8000001d, 31:0, edx, 0, wbinvd_rll_no_guarantee, WBINVD/INVD not guaranteed for Remote Lower-Level caches +0x8000001d, 31:0, edx, 1, ll_inclusive , Cache is inclusive of Lower-Level caches + +# Leaf 8000001EH +# AMD CPU topology enumeration + +0x8000001e, 0, eax, 31:0, ext_apic_id , Extended APIC ID +0x8000001e, 0, ebx, 7:0, core_id , Unique per-socket logical core unit ID +0x8000001e, 0, 
ebx, 15:8, core_nthreads , #Threads per core (zero-based)
+0x8000001e, 0, ecx, 7:0, node_id , Node (die) ID of invoking logical CPU
+0x8000001e, 0, ecx, 10:8, nnodes_per_socket , #nodes in invoking logical CPU's package/socket
+
+# Leaf 8000001FH
+# AMD encrypted memory capabilities enumeration (SME/SEV)
+
+0x8000001f, 0, eax, 0, sme , Secure Memory Encryption supported
+0x8000001f, 0, eax, 1, sev , Secure Encrypted Virtualization supported
+0x8000001f, 0, eax, 2, vm_page_flush , VM Page Flush MSR (0xc001011e) available
+0x8000001f, 0, eax, 3, sev_es , SEV Encrypted State supported
+0x8000001f, 0, eax, 4, sev_nested_paging , SEV secure nested paging supported
+0x8000001f, 0, eax, 5, vm_permission_levels , VMPL supported
+0x8000001f, 0, eax, 6, rpmquery , RPMQUERY instruction supported
+0x8000001f, 0, eax, 7, vmpl_sss , VMPL supervisor shadow stack supported
+0x8000001f, 0, eax, 8, secure_tsc , Secure TSC supported
+0x8000001f, 0, eax, 9, v_tsc_aux , Hardware virtualizes TSC_AUX
+0x8000001f, 0, eax, 10, sme_coherent , HW enforces cache coherency across encryption domains
+0x8000001f, 0, eax, 11, req_64bit_hypervisor , SEV guest mandates 64-bit hypervisor
+0x8000001f, 0, eax, 12, restricted_injection , Restricted Injection supported
+0x8000001f, 0, eax, 13, alternate_injection , Alternate Injection supported
+0x8000001f, 0, eax, 14, debug_swap , SEV-ES: full debug state swap is supported
+0x8000001f, 0, eax, 15, disallow_host_ibs , SEV-ES: Disallowing IBS use by the host is supported
+0x8000001f, 0, eax, 16, virt_transparent_enc , Virtual Transparent Encryption
+0x8000001f, 0, eax, 17, vmgexit_parameter , VmgexitParameter is supported in SEV_FEATURES
+0x8000001f, 0, eax, 18, virt_tom_msr , Virtual TOM MSR is supported
+0x8000001f, 0, eax, 19, virt_ibs , IBS state virtualization is supported for SEV-ES guests
+0x8000001f, 0, eax, 24, vmsa_reg_protection , VMSA register protection is supported
+0x8000001f, 0, eax, 25, smt_protection , SMT protection is supported
+0x8000001f, 0, eax, 28, svsm_page_msr , SVSM communication page MSR (0xc001f000) is supported
+0x8000001f, 0, eax, 29, nested_virt_snp_msr , VIRT_RMPUPDATE/VIRT_PSMASH MSRs are supported
+0x8000001f, 0, ebx, 5:0, pte_cbit_pos , PTE bit number used to enable memory encryption
+0x8000001f, 0, ebx, 11:6, phys_addr_reduction_nbits, Reduction of phys address space when encryption is enabled, in bits
+0x8000001f, 0, ebx, 15:12, vmpl_count , Number of VM permission levels (VMPL) supported
+0x8000001f, 0, ecx, 31:0, enc_guests_max , Max supported number of simultaneous encrypted guests
+0x8000001f, 0, edx, 31:0, min_sev_asid_no_sev_es , Minimum ASID for SEV-enabled SEV-ES-disabled guest
+
+# Leaf 80000020H
+# AMD Platform QoS extended feature IDs
+
+0x80000020, 0, ebx, 1, mba , Memory Bandwidth Allocation support
+0x80000020, 0, ebx, 2, smba , Slow Memory Bandwidth Allocation support
+0x80000020, 0, ebx, 3, bmec , Bandwidth Monitoring Event Configuration support
+0x80000020, 0, ebx, 4, l3rr , L3 Range Reservation support
+0x80000020, 1, eax, 31:0, mba_limit_len , MBA enforcement limit size
+0x80000020, 1, edx, 31:0, mba_cos_max , MBA max Class of Service number (zero-based)
+0x80000020, 2, eax, 31:0, smba_limit_len , SMBA enforcement limit size
+0x80000020, 2, edx, 31:0, smba_cos_max , SMBA max Class of Service number (zero-based)
+0x80000020, 3, ebx, 7:0, bmec_num_events , BMEC number of bandwidth events available
+0x80000020, 3, ecx, 0, bmec_local_reads , Local NUMA reads can be tracked
+0x80000020, 3, ecx, 1, bmec_remote_reads , Remote NUMA reads can be tracked
+0x80000020, 3, ecx, 2, bmec_local_nontemp_wr , Local NUMA non-temporal writes can be tracked
+0x80000020, 3, ecx, 3, bmec_remote_nontemp_wr , Remote NUMA non-temporal writes can be tracked
+0x80000020, 3, ecx, 4, bmec_local_slow_mem_rd , Local NUMA slow-memory reads can be tracked
+0x80000020, 3, ecx, 5, bmec_remote_slow_mem_rd, Remote NUMA slow-memory reads can be tracked
+0x80000020, 3, ecx, 6, bmec_all_dirty_victims , Dirty QoS victims to all types of memory can be tracked
+
+# Leaf 80000021H
+# AMD extended features enumeration 2
+
+0x80000021, 0, eax, 0, no_nested_data_bp , No nested data breakpoints
+0x80000021, 0, eax, 1, fsgs_non_serializing , WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing
+0x80000021, 0, eax, 2, lfence_rdtsc , LFENCE always serializing / synchronizes RDTSC
+0x80000021, 0, eax, 3, smm_page_cfg_lock , SMM paging configuration lock is supported
+0x80000021, 0, eax, 6, null_sel_clr_base , Null selector clears base
+0x80000021, 0, eax, 7, upper_addr_ignore , EFER MSR Upper Address Ignore Enable bit supported
+0x80000021, 0, eax, 8, autoibrs , EFER MSR Automatic IBRS enable bit supported
+0x80000021, 0, eax, 9, no_smm_ctl_msr , SMM_CTL MSR (0xc0010116) is not present
+0x80000021, 0, eax, 10, fsrs_supported , Fast Short Rep Stosb (FSRS) is supported
+0x80000021, 0, eax, 11, fsrc_supported , Fast Short Repe Cmpsb (FSRC) is supported
+0x80000021, 0, eax, 13, prefetch_ctl_msr , Prefetch control MSR is supported
+0x80000021, 0, eax, 17, user_cpuid_disable , #GP when executing CPUID at CPL > 0 is supported
+0x80000021, 0, eax, 18, epsf_supported , Enhanced Predictive Store Forwarding (EPSF) is supported
+0x80000021, 0, ebx, 11:0, microcode_patch_size , Size of microcode patch, in 16-byte units
+
+# Leaf 80000022H
+# AMD Performance Monitoring v2 enumeration
+
+0x80000022, 0, eax, 0, perfmon_v2 , Performance monitoring v2 supported
+0x80000022, 0, eax, 1, lbr_v2 , Last Branch Record v2 extensions (LBR Stack)
+0x80000022, 0, eax, 2, lbr_pmc_freeze , Freezing core performance counters / LBR Stack supported
+0x80000022, 0, ebx, 3:0, n_pmc_core , Number of core performance counters
+0x80000022, 0, ebx, 9:4, lbr_v2_stack_size , Number of available LBR stack entries
+0x80000022, 0, ebx, 15:10, n_pmc_northbridge , Number of available northbridge (data fabric) performance counters
+0x80000022, 0, ebx, 21:16, n_pmc_umc , Number of available UMC performance counters
+0x80000022, 0, ecx, 31:0, active_umc_bitmask , Active UMCs bitmask
+
+# Leaf 80000023H
+# AMD Secure Multi-key Encryption enumeration
+
+0x80000023, 0, eax, 0, mem_hmk_mode , MEM-HMK encryption mode is supported
+0x80000023, 0, ebx, 15:0, mem_hmk_avail_keys , MEM-HMK mode: total num of available encryption keys
+
+# Leaf 80000026H
+# AMD extended topology enumeration v2
+
+0x80000026, 3:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive)
+0x80000026, 3:0, eax, 29, core_has_pwreff_ranking, This core has a power efficiency ranking
+0x80000026, 3:0, eax, 30, domain_has_hybrid_cores, This domain level has hybrid (E, P) cores
+0x80000026, 3:0, eax, 31, domain_core_count_asymm, The 'Core' domain has an asymmetric core count
+0x80000026, 3:0, ebx, 15:0, domain_lcpus_count , Number of logical CPUs at this domain instance
+0x80000026, 3:0, ebx, 23:16, core_pwreff_ranking , This core's static power efficiency ranking
+0x80000026, 3:0, ebx, 27:24, core_native_model_id , This core's native model ID
+0x80000026, 3:0, ebx, 31:28, core_type , This core's type
+0x80000026, 3:0, ecx, 7:0, domain_level , This domain level (subleaf ID)
+0x80000026, 3:0, ecx, 15:8, domain_type , This domain type
+0x80000026, 3:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU
-- cgit v1.2.3

From 18ee44ce97c18ee72f5807140d07ff8cebe3cab5 Mon Sep 17 00:00:00 2001
From: Luigi Leonardi
Date: Tue, 30 Jul 2024 21:43:08 +0200
Subject: test/vsock: add ioctl unsent bytes test

Introduce two tests, one for SOCK_STREAM and one for SOCK_SEQPACKET, which use the SIOCOUTQ ioctl to check that the number of unsent bytes is zero after delivering a packet.

vsock_connect and vsock_accept are no longer static: this is to create more generic tests, allowing code to be reused for SEQPACKET and STREAM.

Signed-off-by: Luigi Leonardi
Reviewed-by: Stefano Garzarella
Signed-off-by: David S. Miller
---
 tools/testing/vsock/util.c       |  6 +--
 tools/testing/vsock/util.h       |  3 ++
 tools/testing/vsock/vsock_test.c | 85 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 554b290fefdc..a3d448a075e3 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po
 }
 
 /* Connect to  and return the file descriptor. */
-static int vsock_connect(unsigned int cid, unsigned int port, int type)
+int vsock_connect(unsigned int cid, unsigned int port, int type)
 {
 	union {
 		struct sockaddr sa;
@@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int port, int type)
 /* Listen on  and return the first incoming connection.  The remote
  * address is stored to clientaddrp.  clientaddrp may be NULL.
  */
-static int vsock_accept(unsigned int cid, unsigned int port,
-			struct sockaddr_vm *clientaddrp, int type)
+int vsock_accept(unsigned int cid, unsigned int port,
+		 struct sockaddr_vm *clientaddrp, int type)
 {
 	union {
 		struct sockaddr sa;
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index e95e62485959..fff22d4a14c0 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -39,6 +39,9 @@ struct test_case {
 void init_signals(void);
 unsigned int parse_cid(const char *str);
 unsigned int parse_port(const char *str);
+int vsock_connect(unsigned int cid, unsigned int port, int type);
+int vsock_accept(unsigned int cid, unsigned int port,
+		 struct sockaddr_vm *clientaddrp, int type);
 int vsock_stream_connect(unsigned int cid, unsigned int port);
 int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type);
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index f851f8961247..8d38dbf8f41f 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -20,6 +20,8 @@
 #include
 #include
 #include
+#include <sys/ioctl.h>
+#include <linux/sockios.h>
 
 #include "vsock_test_zerocopy.h"
 #include "timeout.h"
@@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct test_opts *opts)
 	}
 }
 
+#define MSG_BUF_IOCTL_LEN 64
+static void test_unsent_bytes_server(const struct test_opts *opts, int type)
+{
+	unsigned char buf[MSG_BUF_IOCTL_LEN];
+	int client_fd;
+
+	client_fd = vsock_accept(VMADDR_CID_ANY, opts->peer_port, NULL, type);
+	if (client_fd < 0) {
+		perror("accept");
+		exit(EXIT_FAILURE);
+	}
+
+	recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf));
+	control_writeln("RECEIVED");
+
+	close(client_fd);
+}
+
+static void test_unsent_bytes_client(const struct test_opts *opts, int type)
+{
+	unsigned
char buf[MSG_BUF_IOCTL_LEN]; + int ret, fd, sock_bytes_unsent; + + fd = vsock_connect(opts->peer_cid, opts->peer_port, type); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + for (int i = 0; i < sizeof(buf); i++) + buf[i] = rand() & 0xFF; + + send_buf(fd, buf, sizeof(buf), 0, sizeof(buf)); + control_expectln("RECEIVED"); + + ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n"); + } else { + perror("ioctl"); + exit(EXIT_FAILURE); + } + } else if (ret == 0 && sock_bytes_unsent != 0) { + fprintf(stderr, + "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n", + sock_bytes_unsent); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_unsent_bytes_client(const struct test_opts *opts) +{ + test_unsent_bytes_client(opts, SOCK_STREAM); +} + +static void test_stream_unsent_bytes_server(const struct test_opts *opts) +{ + test_unsent_bytes_server(opts, SOCK_STREAM); +} + +static void test_seqpacket_unsent_bytes_client(const struct test_opts *opts) +{ + test_unsent_bytes_client(opts, SOCK_SEQPACKET); +} + +static void test_seqpacket_unsent_bytes_server(const struct test_opts *opts) +{ + test_unsent_bytes_server(opts, SOCK_SEQPACKET); +} + #define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) /* This define is the same as in 'include/linux/virtio_vsock.h': * it is used to decide when to send credit update message during @@ -1523,6 +1598,16 @@ static struct test_case test_cases[] = { .run_client = test_stream_rcvlowat_def_cred_upd_client, .run_server = test_stream_cred_upd_on_low_rx_bytes, }, + { + .name = "SOCK_STREAM ioctl(SIOCOUTQ) 0 unsent bytes", + .run_client = test_stream_unsent_bytes_client, + .run_server = test_stream_unsent_bytes_server, + }, + { + .name = "SOCK_SEQPACKET ioctl(SIOCOUTQ) 0 unsent bytes", + .run_client = test_seqpacket_unsent_bytes_client, + .run_server = test_seqpacket_unsent_bytes_server, + }, {}, }; -- cgit v1.2.3 From db61e6a4eee5a7884b2cafeaf407895f253bbaa7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 1 Aug 2024 15:27:24 +0200 Subject: selftests/bpf: fix uprobe.path leak in bpf_testmod testmod_unregister_uprobe() forgets to path_put(&uprobe.path). Signed-off-by: Jiri Olsa Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240801132724.GA8791@redhat.com --- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index fd28c1157bd3..72f565af4f82 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -477,6 +477,7 @@ static void testmod_unregister_uprobe(void) if (uprobe.offset) { uprobe_unregister(d_real_inode(uprobe.path.dentry), uprobe.offset, &uprobe.consumer); + path_put(&uprobe.path); uprobe.offset = 0; } -- cgit v1.2.3 From e04332ebc8ac128fa551e83f1161ab1c094d13a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:28 +0200 Subject: uprobes: kill uprobe_register_refctr() It doesn't make any sense to have 2 versions of _register(). Note that trace_uprobe_enable(), the only user of uprobe_register(), doesn't need to check tu->ref_ctr_offset to decide which one should be used, it could safely pass ref_ctr_offset == 0 to uprobe_register_refctr(). 
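For illustration, the caller-side duplication being removed looks roughly like this sketch, distilled from the trace_uprobe_enable() hunk below:

	if (tu->ref_ctr_offset)
		ret = uprobe_register_refctr(tu->inode, tu->offset,
					     tu->ref_ctr_offset, &tu->consumer);
	else
		ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);

With ref_ctr_offset folded into uprobe_register() itself, the whole block collapses into one unconditional call.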
Add this argument to uprobe_register(), update the callers, and kill uprobe_register_refctr(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240801132728.GA8800@redhat.com --- include/linux/uprobes.h | 9 ++------ kernel/events/uprobes.c | 24 ++++++---------------- kernel/trace/bpf_trace.c | 8 ++++---- kernel/trace/trace_uprobe.c | 7 +------ .../selftests/bpf/bpf_testmod/bpf_testmod.c | 4 ++-- 5 files changed, 15 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index a270a5892ab4..788813c0b8fc 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -112,8 +112,7 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); @@ -154,11 +153,7 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) static inline int -uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - return -ENOSYS; -} -static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) +uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { return -ENOSYS; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e9b092acc71b..3a80154bc4c7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1121,25 +1121,26 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume EXPORT_SYMBOL_GPL(uprobe_unregister); /* - * __uprobe_register - register a probe + * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. + * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * - * Apart from the access refcount, __uprobe_register() takes a creation + * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of __uprobe_register() is required to keep @inode + * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. 
* * Return errno if it cannot successully install probes * else return 0 (success) */ -static int __uprobe_register(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) +int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -1189,21 +1190,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset, goto retry; return ret; } - -int uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, 0, uc); -} EXPORT_SYMBOL_GPL(uprobe_register); -int uprobe_register_refctr(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, ref_ctr_offset, uc); -} -EXPORT_SYMBOL_GPL(uprobe_register_refctr); - /* * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cd098846e251..afa909e17824 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3480,10 +3480,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register_refctr(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); + err = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); if (err) { bpf_uprobe_unregister(&path, uprobes, i); goto error_free; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c98e3b3386ba..1f590f989c1e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1089,12 +1089,7 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) tu->consumer.filter = filter; tu->inode = d_real_inode(tu->path.dentry); - if (tu->ref_ctr_offset) - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - else - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - + ret = uprobe_register(tu->inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); if (ret) tu->inode = NULL; diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 72f565af4f82..55f6905de743 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -458,8 +458,8 @@ static int testmod_register_uprobe(loff_t offset) if (err) goto out; - err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); + err = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); if (err) path_put(&uprobe.path); else -- cgit v1.2.3 From 3c83a9ad0295eb63bdeb81d821b8c3b9417fbcac Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:34 +0200 Subject: uprobes: make uprobe_register() return struct uprobe * This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *" rather than inode + offset. This simplifies the code and allows to avoid the unnecessary find_uprobe() + put_uprobe() in these functions. TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that this uprobe can't be freed before up_write(&uprobe->register_rwsem). 
Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20240801132734.GA8803@redhat.com --- include/linux/uprobes.h | 15 +++--- kernel/events/uprobes.c | 56 +++++++++------------- kernel/trace/bpf_trace.c | 25 +++++----- kernel/trace/trace_uprobe.c | 26 +++++----- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 25 +++++----- 5 files changed, 67 insertions(+), 80 deletions(-) (limited to 'tools') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 788813c0b8fc..f50df1fa93e7 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include +struct uprobe; struct vm_area_struct; struct mm_struct; struct inode; @@ -112,9 +113,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); -extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); +extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -152,18 +153,18 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) -static inline int +static inline struct uprobe * uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { - return -ENOSYS; + return ERR_PTR(-ENOSYS); } static inline int -uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) +uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) { return -ENOSYS; } static inline void -uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { } static inline int uprobe_mmap(struct vm_area_struct *vma) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3a80154bc4c7..b33f1397dae0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1099,20 +1099,14 @@ __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) delete_uprobe(uprobe); } -/* +/** * uprobe_unregister - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. + * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. 
*/ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe *uprobe; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return; - + get_uprobe(uprobe); down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); up_write(&uprobe->register_rwsem); @@ -1120,7 +1114,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume } EXPORT_SYMBOL_GPL(uprobe_unregister); -/* +/** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. @@ -1136,40 +1130,40 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * - * Return errno if it cannot successully install probes - * else return 0 (success) + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ -int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, - struct uprobe_consumer *uc) +struct uprobe *uprobe_register(struct inode *inode, + loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) - return -EINVAL; + return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) - return -EIO; + return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) - return -EINVAL; + return ERR_PTR(-EINVAL); retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) - return PTR_ERR(uprobe); + return uprobe; /* * We can race with uprobe_unregister()->delete_uprobe(). @@ -1188,35 +1182,29 @@ int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, if (unlikely(ret == -EAGAIN)) goto retry; - return ret; + + return ret ? ERR_PTR(ret) : uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); -/* - * uprobe_apply - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. +/** + * uprobe_apply - add or remove the breakpoints according to @uc->filter + * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints + * Return: 0 on success or negative error code. */ -int uprobe_apply(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc, bool add) +int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { - struct uprobe *uprobe; struct uprobe_consumer *con; int ret = -ENOENT; - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return ret; - down_write(&uprobe->register_rwsem); for (con = uprobe->consumers; con && con != uc ; con = con->next) ; if (con) ret = register_for_each_vma(uprobe, add ? 
uc : NULL); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); return ret; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index afa909e17824..4e391daafa64 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3160,6 +3160,7 @@ struct bpf_uprobe { loff_t offset; unsigned long ref_ctr_offset; u64 cookie; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -3178,15 +3179,12 @@ struct bpf_uprobe_multi_run_ctx { struct bpf_uprobe *uprobe; }; -static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, - u32 cnt) +static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; - for (i = 0; i < cnt; i++) { - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, - &uprobes[i].consumer); - } + for (i = 0; i < cnt; i++) + uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) @@ -3194,7 +3192,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); @@ -3480,12 +3478,13 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); - if (err) { - bpf_uprobe_unregister(&path, uprobes, i); + uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); + if (IS_ERR(uprobes[i].uprobe)) { + err = PTR_ERR(uprobes[i].uprobe); + bpf_uprobe_unregister(uprobes, i); goto error_free; } } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1f590f989c1e..52e76a73fa7c 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -58,8 +58,8 @@ struct trace_uprobe { struct dyn_event devent; struct uprobe_consumer consumer; struct path path; - struct inode *inode; char *filename; + struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; unsigned long nhit; @@ -1084,16 +1084,16 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - int ret; + struct inode *inode = d_real_inode(tu->path.dentry); + struct uprobe *uprobe; tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); - - ret = uprobe_register(tu->inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); - if (ret) - tu->inode = NULL; + uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); - return ret; + tu->uprobe = uprobe; + return 0; } static void __probe_event_disable(struct trace_probe *tp) @@ -1104,11 +1104,11 @@ static void __probe_event_disable(struct trace_probe *tp) WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - if (!tu->inode) + if (!tu->uprobe) continue; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + uprobe_unregister(tu->uprobe, &tu->consumer); + tu->uprobe = NULL; } } @@ -1305,7 
+1305,7 @@ static int uprobe_perf_close(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + ret = uprobe_apply(tu->uprobe, &tu->consumer, false); if (ret) break; } @@ -1329,7 +1329,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + err = uprobe_apply(tu->uprobe, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); break; diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 55f6905de743..3c0515a27842 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -432,7 +432,7 @@ uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct testmod_uprobe { struct path path; - loff_t offset; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -446,25 +446,25 @@ static int testmod_register_uprobe(loff_t offset) { int err = -EBUSY; - if (uprobe.offset) + if (uprobe.uprobe) return -EBUSY; mutex_lock(&testmod_uprobe_mutex); - if (uprobe.offset) + if (uprobe.uprobe) goto out; err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path); if (err) goto out; - err = uprobe_register(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); - if (err) + uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); + if (IS_ERR(uprobe.uprobe)) { + err = PTR_ERR(uprobe.uprobe); path_put(&uprobe.path); - else - uprobe.offset = offset; - + uprobe.uprobe = NULL; + } out: mutex_unlock(&testmod_uprobe_mutex); return err; @@ -474,11 +474,10 @@ static void testmod_unregister_uprobe(void) { mutex_lock(&testmod_uprobe_mutex); - if (uprobe.offset) { - uprobe_unregister(d_real_inode(uprobe.path.dentry), - uprobe.offset, &uprobe.consumer); + if (uprobe.uprobe) { + uprobe_unregister(uprobe.uprobe, &uprobe.consumer); path_put(&uprobe.path); - uprobe.offset = 0; + uprobe.uprobe = NULL; } mutex_unlock(&testmod_uprobe_mutex); -- cgit v1.2.3 From 6998a73efbb8a87f4dd0bddde90b7f5b0d47b5e0 Mon Sep 17 00:00:00 2001 From: Keith Lucas Date: Fri, 2 Aug 2024 06:13:18 +0000 Subject: selftests/mm: Add new testcases for pkeys Add a few new tests to exercise the signal handler flow, especially with PKEY 0 disabled: - Verify that the SIGSEGV handler is invoked when pkey 0 is disabled. - Verify that a thread which disables PKEY 0 segfaults with PKUERR when accessing the stack. - Verify that the SIGSEGV handler that uses an alternate signal stack is correctly invoked when the thread disabled PKEY 0 - Verify that the PKRU value set by the application is correctly restored upon return from signal handling. - Verify that sigreturn() is able to restore the altstack even if the thread had PKEY 0 disabled [ Aruna: Adapted to upstream ] [ tglx: Made it actually compile. Restored protection_keys compile. Added useful info to the changelog instead of bare function names. 
] Signed-off-by: Keith Lucas Signed-off-by: Aruna Ramakrishna Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240802061318.2140081-6-aruna.ramakrishna@oracle.com --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/pkey-helpers.h | 13 +- tools/testing/selftests/mm/pkey_sighandler_tests.c | 481 +++++++++++++++++++++ tools/testing/selftests/mm/protection_keys.c | 10 - 4 files changed, 494 insertions(+), 11 deletions(-) create mode 100644 tools/testing/selftests/mm/pkey_sighandler_tests.c (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 901e0d07765b..1f176fff7054 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -88,6 +88,7 @@ CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_pr CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) VMTARGETS := protection_keys +VMTARGETS += pkey_sighandler_tests BINARIES_32 := $(VMTARGETS:%=%_32) BINARIES_64 := $(VMTARGETS:%=%_64) diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 1af3156a9db8..4d31a309a46b 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -79,7 +79,18 @@ extern void abort_hooks(void); } \ } while (0) -__attribute__((noinline)) int read_ptr(int *ptr); +#define barrier() __asm__ __volatile__("": : :"memory") +#ifndef noinline +# define noinline __attribute__((noinline)) +#endif + +noinline int read_ptr(int *ptr) +{ + /* Keep GCC from optimizing this away somehow */ + barrier(); + return *ptr; +} + void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c new file mode 100644 index 000000000000..a8088b645ad6 --- /dev/null +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * + * The testcases in this file exercise various flows related to signal handling, + * using an alternate signal stack, with the default pkey (pkey 0) disabled. + * + * Compile with: + * gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + * gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + */ +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pkey-helpers.h" + +#define STACK_SIZE PTHREAD_STACK_MIN + +void expected_pkey_fault(int pkey) {} + +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +siginfo_t siginfo = {0}; + +/* + * We need to use inline assembly instead of glibc's syscall because glibc's + * syscall will attempt to access the PLT in order to call a library function + * which is protected by MPK 0 which we don't have access to. 
+ */ +static inline __always_inline +long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) +{ + unsigned long ret; +#ifdef __x86_64__ + register long r10 asm("r10") = a4; + register long r8 asm("r8") = a5; + register long r9 asm("r9") = a6; + asm volatile ("syscall" + : "=a"(ret) + : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9) + : "rcx", "r11", "memory"); +#elif defined __i386__ + asm volatile ("int $0x80" + : "=a"(ret) + : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5) + : "memory"); +#else +# error syscall_raw() not implemented +#endif + return ret; +} + +static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); +} + +static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); +} + +static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext) +{ + /* + * pkru should be the init_pkru value which enabled MPK 0 so + * we can use library functions. + */ + printf("%s invoked.\n", __func__); +} + +static void raise_sigusr2(void) +{ + pid_t tid = 0; + + tid = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0); + + syscall_raw(SYS_tkill, tid, SIGUSR2, 0, 0, 0, 0); + + /* + * We should return from the signal handler here and be able to + * return to the interrupted thread. + */ +} + +static void *thread_segv_with_pkey0_disabled(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* Segfault (with SEGV_MAPERR) */ + *(int *) (0x1) = 1; + return NULL; +} + +static void *thread_segv_pkuerr_stack(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* After we disable MPK 0, we can't access the stack to return */ + return NULL; +} + +static void *thread_segv_maperr_ptr(void *ptr) +{ + stack_t *stack = ptr; + int *bad = (int *)1; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 1 is enabled. */ + __write_pkey_reg(0x55555551); + + /* Segfault */ + *bad = 1; + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0. 
+ */ +static void test_sigsegv_handler_with_pkey0_disabled(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0, which renders them inaccessible when MPK 0 + * is disabled. So just the return from the thread should cause a + * segfault with SEGV_PKUERR. + */ +static void test_sigsegv_handler_cannot_access_stack(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_PKUERR, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler that uses an alternate signal stack + * is correctly invoked for a thread which uses a non-zero MPK to protect + * its own stack, and disables all other MPKs (including 0). 
+ */ +static void test_sigsegv_handler_with_different_pkey_for_stack(void) +{ + struct sigaction sa; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + + sa.sa_sigaction = sigsegv_handler; + + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* Allow access to MPK 0 and MPK 1 */ + __write_pkey_reg(0x55555550); + + /* Protect the new stack with MPK 1 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + memset(&siginfo, 0, sizeof(siginfo)); + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_segv_maperr_ptr(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the PKRU value set by the application is correctly + * restored upon return from signal handling. + */ +static void test_pkru_preserved_after_sigusr1(void) +{ + struct sigaction sa; + unsigned long pkru = 0x45454544; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigusr1_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + __write_pkey_reg(pkru); + + raise(SIGUSR1); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + /* Ensure the pkru value is the same after returning from signal. */ + ksft_test_result(pkru == __read_pkey_reg() && + siginfo.si_signo == SIGUSR1, + "%s\n", __func__); +} + +static noinline void *thread_sigusr2_self(void *ptr) +{ + /* + * A const char array like "Resuming after SIGUSR2" won't be stored on + * the stack and the code could access it via an offset from the program + * counter. This makes sure it's on the function's stack frame. + */ + char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ', + 'a', 'f', 't', 'e', 'r', ' ', + 'S', 'I', 'G', 'U', 'S', 'R', '2', + '.', '.', '.', '\n', '\0'}; + stack_t *stack = ptr; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 2 is enabled. 
*/ + __write_pkey_reg(0x55555545); + + raise_sigusr2(); + + /* Do something, to show the thread resumed execution after the signal */ + syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0); + + /* + * We can't return to test_pkru_sigreturn because it + * will attempt to use a %rbp value which is on the stack + * of the main thread. + */ + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that sigreturn is able to restore altstack even if the thread had + * disabled pkey 0. + */ +static void test_pkru_sigreturn(void) +{ + struct sigaction sa = {0}; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_handler = SIG_DFL; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + + /* + * For this testcase, we do not want to handle SIGSEGV. Reset handler + * to default so that the application can crash if it receives SIGSEGV. + */ + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + sa.sa_sigaction = sigusr2_handler; + sigemptyset(&sa.sa_mask); + + if (sigaction(SIGUSR2, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* + * Allow access to MPK 0 and MPK 2. The child thread (to be created + * later in this flow) will have its stack protected by MPK 2, whereas + * the current thread's stack is protected by the default MPK 0. Hence + * both need to be enabled. + */ + __write_pkey_reg(0x55555544); + + /* Protect the stack with MPK 2 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_sigusr2_self(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + child_pid = ret; + /* Check that thread exited */ + do { + sched_yield(); + ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0); + } while (ret != -ESRCH && ret != -EINVAL); + + ksft_test_result_pass("%s\n", __func__); +} + +static void (*pkey_tests[])(void) = { + test_sigsegv_handler_with_pkey0_disabled, + test_sigsegv_handler_cannot_access_stack, + test_sigsegv_handler_with_different_pkey_for_stack, + test_pkru_preserved_after_sigusr1, + test_pkru_sigreturn +}; + +int main(int argc, char *argv[]) +{ + int i; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(pkey_tests)); + + for (i = 0; i < ARRAY_SIZE(pkey_tests); i++) + (*pkey_tests[i])(); + + ksft_finished(); + return 0; +} diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index eaa6d1fc5328..cc6de1644360 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -950,16 +950,6 @@ void close_test_fds(void) nr_test_fds = 0; } -#define barrier() __asm__ 
__volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) { int i, err; -- cgit v1.2.3 From 3d650ab5e7d9c4d7306e4c116f8aa9980bf13295 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 2 Aug 2024 11:54:34 -0700 Subject: selftests/bpf: Fix a btf_dump selftest failure Jakub reported bpf selftest "btf_dump" failure after forwarding to v6.11-rc1 with netdev. Error: #33 btf_dump Error: #33/15 btf_dump/btf_dump: var_data btf_dump_data:FAIL:find type id unexpected find type id: actual -2 < expected 0 The reason for the failure is due to commit 94ede2a3e913 ("profiling: remove stale percpu flip buffer variables") where percpu static variable "cpu_profile_flip" is removed. Let us replace "cpu_profile_flip" with a variable in bpf subsystem so whenever that variable gets deleted or renamed, we can detect the failure immediately. In this case, I picked a static percpu variable "bpf_cgrp_storage_busy" which is defined in kernel/bpf/bpf_cgrp_storage.c. Reported-by: Jakub Kicinski Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240802185434.1749056-1-yonghong.song@linux.dev --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 09a8e6f9b379..b293b8501fd6 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -805,8 +805,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, "int cpu_number = (int)100", 100); #endif - TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, - "static int cpu_profile_flip = (int)2", 2); + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, + "static int bpf_cgrp_storage_busy = (int)2", 2); } static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, -- cgit v1.2.3 From ab1000976cc7de8e57bdef811dfcfcb6c17a929f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 1 Aug 2024 17:03:07 -0700 Subject: selftests: net-drv: exercise queue stats when the device is down Verify that total device stats don't decrease after it has been turned down. Also make sure the device doesn't crash when we access per-queue stats when it's down (in case it tries to access some pointers that are NULL). 
KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:5 fail:0 xfail:0 xpass:0 skip:0 error:0 v3: - use errno.EOPNOTSUPP (Petr) - move qstat[0] under try (Petr) v2: - KTAP output formatting (Jakub) - defer instead of try/finally (Jakub) - disappearing stats is an error (Jakub) - ksft_ge instead of open coding (Jakub) Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20240802000309.2368-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/stats.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 820b8e0a22c6..2fdde8cf0307 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +import errno from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv +from lib.py import ip, defer ethnl = EthtoolFamily() netfam = NetdevFamily() @@ -133,9 +135,30 @@ def qstat_by_ifindex(cfg) -> None: ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') +def check_down(cfg) -> None: + try: + qstat = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + except NlError as e: + if e.error == errno.EOPNOTSUPP: + raise KsftSkipEx("qstats not supported by the device") + raise + + ip(f"link set dev {cfg.dev['ifname']} down") + defer(ip, f"link set dev {cfg.dev['ifname']} up") + + qstat2 = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + for k, v in qstat.items(): + ksft_ge(qstat2[k], qstat[k], comment=f"{k} went backwards on device down") + + # exercise per-queue API to make sure that "device down" state + # is handled correctly and doesn't crash + netfam.qstats_get({"ifindex": cfg.ifindex, "scope": "queue"}, dump=True) + + def main() -> None: with NetDrvEnv(__file__) as cfg: - ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex], + ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, + check_down], args=(cfg, )) ksft_exit() -- cgit v1.2.3 From f879306834818ebd1722a4372079610cdd466fec Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 1 Aug 2024 17:03:08 -0700 Subject: selftests: net: ksft: support marking tests as disruptive Add new @ksft_disruptive decorator to mark the tests that might be disruptive to the system. Depending on how well the previous test works in the CI we might want to disable disruptive tests by default and only let the developers run them manually. KSFT framework runs disruptive tests by default. DISRUPTIVE=False environment (or config file) can be used to disable these tests. ksft_setup should be called by the test cases that want to use new decorator (ksft_setup is only called via NetDrvEnv/NetDrvEpEnv for now). In the future we can add similar decorators to, for example, avoid running slow tests all the time. And/or have some option to run only 'fast' tests for some sort of smoke test scenario. 
$ DISRUPTIVE=False ./stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # SKIP marked as disruptive # Totals: pass:4 fail:0 xfail:0 xpass:0 skip:1 error:0 v3: - parse yes and properly treat non-zero nums as true (Petr) v2: - convert from cli argument to env variable (Jakub) Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20240802000309.2368-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 5 +-- tools/testing/selftests/drivers/net/stats.py | 2 ++ tools/testing/selftests/net/lib/py/ksft.py | 40 +++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index a5e800b8f103..1ea9bb695e94 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -4,6 +4,7 @@ import os import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx +from lib.py import ksft_setup from lib.py import cmd, ethtool, ip from lib.py import NetNS, NetdevSimDev from .remote import Remote @@ -14,7 +15,7 @@ def _load_env_file(src_path): src_dir = Path(src_path).parent.resolve() if not (src_dir / "net.config").exists(): - return env + return ksft_setup(env) with open((src_dir / "net.config").as_posix(), 'r') as fp: for line in fp.readlines(): @@ -30,7 +31,7 @@ def _load_env_file(src_path): if len(pair) != 2: raise Exception("Can't parse configuration line:", full_file) env[pair[0]] = pair[1] - return env + return ksft_setup(env) class NetDrvEnv: diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 2fdde8cf0307..d17dfed2788f 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -4,6 +4,7 @@ import errno from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx +from lib.py import ksft_disruptive from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv from lib.py import ip, defer @@ -135,6 +136,7 @@ def qstat_by_ifindex(cfg) -> None: ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') +@ksft_disruptive def check_down(cfg) -> None: try: qstat = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index f26c20df9db4..353860fe6223 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 import builtins +import functools import inspect import sys import time @@ -10,6 +11,7 @@ from .utils import global_defer_queue KSFT_RESULT = None KSFT_RESULT_ALL = True +KSFT_DISRUPTIVE = True class KsftFailEx(Exception): @@ -127,6 +129,44 @@ def ksft_flush_defer(): KSFT_RESULT = False +def ksft_disruptive(func): + """ + Decorator that marks the test as disruptive (e.g. the test + that can down the interface). Disruptive tests can be skipped + by passing DISRUPTIVE=False environment variable. 
+ """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not KSFT_DISRUPTIVE: + raise KsftSkipEx(f"marked as disruptive") + return func(*args, **kwargs) + return wrapper + + +def ksft_setup(env): + """ + Setup test framework global state from the environment. + """ + + def get_bool(env, name): + value = env.get(name, "").lower() + if value in ["yes", "true"]: + return True + if value in ["no", "false"]: + return False + try: + return bool(int(value)) + except: + raise Exception(f"failed to parse {name}") + + if "DISRUPTIVE" in env: + global KSFT_DISRUPTIVE + KSFT_DISRUPTIVE = get_bool(env, "DISRUPTIVE") + + return env + + def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cases = cases or [] -- cgit v1.2.3 From a48395f22b8c8687ceb77ae3014a0eabcd4bf688 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 1 Aug 2024 17:03:09 -0700 Subject: selftests: net: ksft: replace 95 with errno.EOPNOTSUPP Petr suggested to use errno.EOPNOTSUPP instead of hard-coded 95 in the new test case. Adjust existing ones to match this style. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20240802000309.2368-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py | 3 ++- tools/testing/selftests/drivers/net/stats.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py index 026d98976c35..05b6fbb3fcdd 100755 --- a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py +++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +import errno import time import os from lib.py import ksft_run, ksft_exit, ksft_pr @@ -61,7 +62,7 @@ def test_pp_alloc(cfg, netdevnl): try: stats = get_stats() except NlError as e: - if e.nl_msg.error == -95: + if e.nl_msg.error == -errno.EOPNOTSUPP: stats = {} else: raise diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index d17dfed2788f..63e3c045a3b2 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -20,7 +20,7 @@ def check_pause(cfg) -> None: try: ethnl.pause_get({"header": {"dev-index": cfg.ifindex}}) except NlError as e: - if e.error == 95: + if e.error == errno.EOPNOTSUPP: raise KsftXfailEx("pause not supported by the device") raise @@ -35,7 +35,7 @@ def check_fec(cfg) -> None: try: ethnl.fec_get({"header": {"dev-index": cfg.ifindex}}) except NlError as e: - if e.error == 95: + if e.error == errno.EOPNOTSUPP: raise KsftXfailEx("FEC not supported by the device") raise @@ -120,7 +120,7 @@ def qstat_by_ifindex(cfg) -> None: # loopback has no stats with ksft_raises(NlError) as cm: netfam.qstats_get({"ifindex": 1}, dump=True) - ksft_eq(cm.exception.nl_msg.error, -95) + ksft_eq(cm.exception.nl_msg.error, -errno.EOPNOTSUPP) ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') # Try to get stats for lowest unused ifindex but not 0 -- cgit v1.2.3 From 46619175f1b7e4f7fc5dc98547f5eca27193f8dc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 1 Aug 2024 16:23:17 -0700 Subject: selftests: net: ksft: print more of the stack for checks Print more stack frames and the failing line when check fails. This helps when tests use helpers to do the checks. 
Before: # At ./ksft/drivers/net/hw/rss_ctx.py line 92: # Check failed 1037698 >= 396893.0 traffic on other queues:[344612, 462380, 233020, 449174, 342298] not ok 8 rss_ctx.test_rss_context_queue_reconfigure After: # Check| At ./ksft/drivers/net/hw/rss_ctx.py, line 387, in test_rss_context_queue_reconfigure: # Check| test_rss_queue_reconfigure(cfg, main_ctx=False) # Check| At ./ksft/drivers/net/hw/rss_ctx.py, line 230, in test_rss_queue_reconfigure: # Check| _send_traffic_check(cfg, port, ctx_ref, { 'target': (0, 3), # Check| At ./ksft/drivers/net/hw/rss_ctx.py, line 92, in _send_traffic_check: # Check| ksft_lt(sum(cnts[i] for i in params['noise']), directed / 2, # Check failed 1045235 >= 405823.5 traffic on other queues (context 1)':[460068, 351995, 565970, 351579, 127270] not ok 8 rss_ctx.test_rss_context_queue_reconfigure Link: https://patch.msgid.link/20240801232317.545577-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 353860fe6223..f663e0daec0d 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -34,8 +34,18 @@ def _fail(*args): global KSFT_RESULT KSFT_RESULT = False - frame = inspect.stack()[2] - ksft_pr("At " + frame.filename + " line " + str(frame.lineno) + ":") + stack = inspect.stack() + started = False + for frame in reversed(stack[2:]): + # Start printing from the test case function + if not started: + if frame.function == 'ksft_run': + started = True + continue + + ksft_pr("Check| At " + frame.filename + ", line " + str(frame.lineno) + + ", in " + frame.function + ":") + ksft_pr("Check| " + frame.code_context[0].strip()) ksft_pr(*args) -- cgit v1.2.3 From c6e2b45a544b45ce1a26858ded61a0dc4896d54a Mon Sep 17 00:00:00 2001 From: Anthony Nandaa Date: Tue, 2 Jul 2024 10:22:50 +0000 Subject: tools: hv: lsvmbus: change shebang to use python3 In many modern Linux distros, running `lsvmbus` returns the error: ``` /usr/bin/env: 'python': No such file or directory ``` because 'python' doesn't point anywhere. Now that python2 has reached EOL as of January 1, 2020 and is no longer maintained[1], these distros have python3 instead. Also, the script isn't executable by default because the permissions are set to mode 644. Fix this by updating the shebang in `lsvmbus` to use python3 instead of python. Also fix the permissions to be 755 so that it is executable by default, which matches other similar scripts in `tools/hv`. The script is also tested and verified to be compatible with python3.
[1] https://www.python.org/doc/sunset-python-2/ Signed-off-by: Anthony Nandaa Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20240702102250.13935-1-profnandaa@gmail.com Signed-off-by: Wei Liu Message-ID: <20240702102250.13935-1-profnandaa@gmail.com> --- tools/hv/lsvmbus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 tools/hv/lsvmbus (limited to 'tools') diff --git a/tools/hv/lsvmbus b/tools/hv/lsvmbus old mode 100644 new mode 100755 index 099f2c44dbed..f83698f14da2 --- a/tools/hv/lsvmbus +++ b/tools/hv/lsvmbus @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 import os -- cgit v1.2.3 From 13159a139d85fa12bbbaefcde5a647e2b9721b19 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 2 Aug 2024 11:09:13 -0700 Subject: perf mem: Update documentation for new options Add a common options section and move some items to the section. Also add description of new options to report options. Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/lkml/20240802180913.1023886-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-mem.txt | 94 +++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt index 47456b212e99..8a1bd9ff0f86 100644 --- a/tools/perf/Documentation/perf-mem.txt +++ b/tools/perf/Documentation/perf-mem.txt @@ -28,15 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide. Due to the statistical nature of SPE sampling, not every memory operation will be sampled. -OPTIONS -------- -...:: - Any command you can specify in a shell. - --i:: ---input=:: - Input file name. - +COMMON OPTIONS +-------------- -f:: --force:: Don't do ownership validation @@ -45,24 +38,9 @@ OPTIONS --type=:: Select the memory operation type: load or store (default: load,store) --D:: ---dump-raw-samples:: - Dump the raw decoded samples on the screen in a format that is easy to parse with - one sample per line. - --x:: ---field-separator=:: - Specify the field separator used when dump raw samples (-D option). By default, - The separator is the space character. - --C:: ---cpu=:: - Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a - comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default - is to monitor all CPUS. --U:: ---hide-unresolved:: - Only display entries resolved to a symbol. +-v:: +--verbose:: + Be more verbose (show counter open errors, etc) -p:: --phys-data:: @@ -73,6 +51,9 @@ OPTIONS RECORD OPTIONS -------------- +...:: + Any command you can specify in a shell. + -e:: --event :: Event selector. Use 'perf mem record -e list' to list available events. @@ -85,14 +66,65 @@ RECORD OPTIONS --all-user:: Configure all used events to run in user space. --v:: ---verbose:: - Be more verbose (show counter open errors, etc) - --ldlat :: Specify desired latency for loads event. Supported on Intel and Arm64 processors only. Ignored on other archs. +REPORT OPTIONS +-------------- +-i:: +--input=:: + Input file name. + +-C:: +--cpu=:: + Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a + comma-separated list with no space: 0,1. 
Ranges of CPUs are specified with - + like 0-2. Default is to monitor all CPUS. + +-D:: +--dump-raw-samples:: + Dump the raw decoded samples on the screen in a format that is easy to parse with + one sample per line. + +-s:: +--sort=:: + Group result by given key(s) - multiple keys can be specified + in CSV format. The keys are specific to memory samples are: + symbol_daddr, symbol_iaddr, dso_daddr, locked, tlb, mem, snoop, + dcacheline, phys_daddr, data_page_size, blocked. + + - symbol_daddr: name of data symbol being executed on at the time of sample + - symbol_iaddr: name of code symbol being executed on at the time of sample + - dso_daddr: name of library or module containing the data being executed + on at the time of the sample + - locked: whether the bus was locked at the time of the sample + - tlb: type of tlb access for the data at the time of the sample + - mem: type of memory access for the data at the time of the sample + - snoop: type of snoop (if any) for the data at the time of the sample + - dcacheline: the cacheline the data address is on at the time of the sample + - phys_daddr: physical address of data being executed on at the time of sample + - data_page_size: the data page size of data being executed on at the time of sample + - blocked: reason of blocked load access for the data at the time of the sample + + And the default sort keys are changed to local_weight, mem, sym, dso, + symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat. + +-T:: +--type-profile:: + Show data-type profile result instead of code symbols. This requires + the debug information and it will change the default sort keys to: + mem, snoop, tlb, type. + +-U:: +--hide-unresolved:: + Only display entries resolved to a symbol. + +-x:: +--field-separator=:: + Specify the field separator used when dump raw samples (-D option). By default, + The separator is the space character. + In addition, for report all perf report options are valid, and for record all perf record options. -- cgit v1.2.3 From 96f30c8f0aa9923aa39b30bcaefeacf88b490231 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 15 Jul 2024 13:32:42 -0700 Subject: tools build: Correct libsubcmd fixdep dependencies All built targets need fixdep to be built first, before handling object dependencies [1]. We're missing one such dependency before the libsubcmd target. This resolves .cmd file generation issues such that the following sequence produces many fewer results: $ git clean -xfd tools/ $ make tools/objtool $ grep "cannot find fixdep" $(find tools/objtool -name '*.cmd') In particular, only a buggy tools/objtool/libsubcmd/.fixdep.o.cmd remains, due to circular dependencies of fixdep on itself. Such incomplete .cmd files don't usually cause a direct problem, since they're designed to fail "open", but they can cause some subtle problems that would otherwise be handled by proper fixdep'd dependency files. [2] [1] This problem is better described in commit abb26210a395 ("perf tools: Force fixdep compilation at the start of the build"). I don't apply its solution here, because additional recursive make can be a bit of overkill. [2] Example failure case: cp -arl linux-src linux-src2 cd linux-src2 make O=/path/to/out cd ../linux-src rm -rf ../linux-src2 make O=/path/to/out Previously, we'd see errors like: make[6]: *** No rule to make target '/path/to/linux-src2/tools/include/linux/compiler.h', needed by '/path/to/out/tools/bpf/resolve_btfids/libsubcmd/exec-cmd.o'. Stop. 
Now, the properly-fixdep'd .cmd files will ignore a missing /path/to/linux-src2/... Signed-off-by: Brian Norris Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Josh Poimboeuf Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/all/ZGVi9HbI43R5trN8@bhelgaas/ Link: https://lore.kernel.org/all/Zk-C5Eg84yt6_nml@google.com/ Link: https://lore.kernel.org/r/20240715203325.3832977-2-briannorris@chromium.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/subcmd/Makefile b/tools/lib/subcmd/Makefile index b87213263a5e..59b09f280e49 100644 --- a/tools/lib/subcmd/Makefile +++ b/tools/lib/subcmd/Makefile @@ -76,7 +76,7 @@ include $(srctree)/tools/build/Makefile.include all: fixdep $(LIBFILE) -$(SUBCMD_IN): FORCE +$(SUBCMD_IN): fixdep FORCE @$(MAKE) $(build)=libsubcmd $(LIBFILE): $(SUBCMD_IN) -- cgit v1.2.3 From ea974028a049f2cea4bb6be963ee3e3844a03f6d Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 15 Jul 2024 13:32:43 -0700 Subject: tools build: Avoid circular .fixdep-in.o.cmd issues The 'fixdep' tool is used to post-process dependency files for various reasons, and it runs after every object file generation command. This even includes 'fixdep' itself. In Kbuild, this isn't actually a problem, because it uses a single command to generate fixdep (a compile-and-link command on fixdep.c), and afterward runs the fixdep command on the accompanying .fixdep.cmd file. In tools/ builds (which notably is maintained separately from Kbuild), fixdep is generated in several phases: 1. fixdep.c -> fixdep-in.o 2. fixdep-in.o -> fixdep Thus, fixdep is not available in the post-processing for step 1, and instead, we generate .cmd files that look like: ## from tools/objtool/libsubcmd/.fixdep.o.cmd # cannot find fixdep (/path/to/linux/tools/objtool/libsubcmd//fixdep) [...] These invalid .cmd files are benign in some respects, but cause problems in others (such as the linked reports). Because the tools/ build system is rather complicated in its own right (and pointedly different than Kbuild), I choose to simply open-code the rule for building fixdep, and avoid the recursive-make indirection that produces the problem in the first place. 
Signed-off-by: Brian Norris Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Josh Poimboeuf Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/all/Zk-C5Eg84yt6_nml@google.com/ Link: https://lore.kernel.org/r/20240715203325.3832977-3-briannorris@chromium.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Build | 3 --- tools/build/Makefile | 11 ++--------- 2 files changed, 2 insertions(+), 12 deletions(-) delete mode 100644 tools/build/Build (limited to 'tools') diff --git a/tools/build/Build b/tools/build/Build deleted file mode 100644 index 76d1a4960973..000000000000 --- a/tools/build/Build +++ /dev/null @@ -1,3 +0,0 @@ -hostprogs := fixdep - -fixdep-y := fixdep.o diff --git a/tools/build/Makefile b/tools/build/Makefile index 17cdf01e29a0..fea3cf647f5b 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -43,12 +43,5 @@ ifneq ($(wildcard $(TMP_O)),) $(Q)$(MAKE) -C feature OUTPUT=$(TMP_O) clean >/dev/null endif -$(OUTPUT)fixdep-in.o: FORCE - $(Q)$(MAKE) $(build)=fixdep - -$(OUTPUT)fixdep: $(OUTPUT)fixdep-in.o - $(QUIET_LINK)$(HOSTCC) $(KBUILD_HOSTLDFLAGS) -o $@ $< - -FORCE: - -.PHONY: FORCE +$(OUTPUT)fixdep: $(srctree)/tools/build/fixdep.c + $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTLDFLAGS) -o $@ $< -- cgit v1.2.3 From dbb2a7a986971ef43d5a60d235c05491647e16f4 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 15 Jul 2024 13:32:44 -0700 Subject: tools build: Correct bpf fixdep dependencies The dependencies in tools/lib/bpf/Makefile are incorrect. Before we recurse to build $(BPF_IN_STATIC), we need to build its 'fixdep' executable. I can't use the usual shortcut from Makefile.include: : fixdep because its 'fixdep' target relies on $(OUTPUT), and $(OUTPUT) differs in the parent 'make' versus the child 'make' -- so I imitate it via open-coding. I tweak a few $(MAKE) invocations while I'm at it, because 1. I'm adding a new recursive make; and 2. these recursive 'make's print spurious lines about files that are "up to date" (which isn't normally a feature in Kbuild subtargets) or "jobserver not available" (see [1]) I also need to tweak the assignment of the OUTPUT variable, so that relative path builds work. For example, for 'make tools/lib/bpf', OUTPUT is unset, and is usually treated as "cwd" -- but recursive make will change cwd and so OUTPUT has a new meaning. For consistency, I ensure OUTPUT is always an absolute path. And $(Q) gets a backup definition in tools/build/Makefile.include, because Makefile.include is sometimes included without tools/build/Makefile, so the "quiet command" stuff doesn't actually work consistently without it. After this change, top-level builds result in an empty grep result from: $ grep 'cannot find fixdep' $(find tools/ -name '*.cmd') [1] https://www.gnu.org/software/make/manual/html_node/MAKE-Variable.html If we're not using $(MAKE) directly, then we need to use more '+'. 
Signed-off-by: Brian Norris Acked-by: Andrii Nakryiko Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Josh Poimboeuf Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20240715203325.3832977-4-briannorris@chromium.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.include | 12 +++++++++++- tools/lib/bpf/.gitignore | 1 + tools/lib/bpf/Makefile | 13 ++++++++++--- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/build/Makefile.include b/tools/build/Makefile.include index 8dadaa0fbb43..0e4de83400ac 100644 --- a/tools/build/Makefile.include +++ b/tools/build/Makefile.include @@ -1,8 +1,18 @@ # SPDX-License-Identifier: GPL-2.0-only build := -f $(srctree)/tools/build/Makefile.build dir=. obj +# More than just $(Q), we sometimes want to suppress all command output from a +# recursive make -- even the 'up to date' printout. +ifeq ($(V),1) + Q ?= + SILENT_MAKE = +$(Q)$(MAKE) +else + Q ?= @ + SILENT_MAKE = +$(Q)$(MAKE) --silent +endif + fixdep: - $(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep + $(SILENT_MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep fixdep-clean: $(Q)$(MAKE) -C $(srctree)/tools/build clean diff --git a/tools/lib/bpf/.gitignore b/tools/lib/bpf/.gitignore index 0da84cb9e66d..f02725b123b3 100644 --- a/tools/lib/bpf/.gitignore +++ b/tools/lib/bpf/.gitignore @@ -5,3 +5,4 @@ TAGS tags cscope.* /bpf_helper_defs.h +fixdep diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 2cf892774346..1b22f0f37288 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -108,6 +108,8 @@ MAKEOVERRIDES= all: +OUTPUT ?= ./ +OUTPUT := $(abspath $(OUTPUT))/ export srctree OUTPUT CC LD CFLAGS V include $(srctree)/tools/build/Makefile.include @@ -141,7 +143,10 @@ all: fixdep all_cmd: $(CMD_TARGETS) check -$(BPF_IN_SHARED): force $(BPF_GENERATED) +$(SHARED_OBJDIR) $(STATIC_OBJDIR): + $(Q)mkdir -p $@ + +$(BPF_IN_SHARED): force $(BPF_GENERATED) | $(SHARED_OBJDIR) @(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \ (diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true @@ -151,9 +156,11 @@ $(BPF_IN_SHARED): force $(BPF_GENERATED) @(test -f ../../include/uapi/linux/if_xdp.h -a -f ../../../include/uapi/linux/if_xdp.h && ( \ (diff -B ../../include/uapi/linux/if_xdp.h ../../../include/uapi/linux/if_xdp.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true + $(SILENT_MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= OUTPUT=$(SHARED_OBJDIR) $(SHARED_OBJDIR)fixdep $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(SHARED_OBJDIR) CFLAGS="$(CFLAGS) $(SHLIB_FLAGS)" -$(BPF_IN_STATIC): force $(BPF_GENERATED) +$(BPF_IN_STATIC): force $(BPF_GENERATED) | $(STATIC_OBJDIR) + $(SILENT_MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= OUTPUT=$(STATIC_OBJDIR) $(STATIC_OBJDIR)fixdep $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h @@ -263,7 +270,7 @@ install_pkgconfig: $(PC_FILE) install: install_lib install_pkgconfig install_headers -clean: +clean: fixdep-clean $(call QUIET_CLEAN, libbpf) $(RM) -rf $(CMD_TARGETS) \ *~ .*.d .*.cmd LIBBPF-CFLAGS $(BPF_GENERATED) \ $(SHARED_OBJDIR) $(STATIC_OBJDIR) 
\ -- cgit v1.2.3 From 2dc02c26419b984692e20949252ca6cba515abaf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:28 -0700 Subject: perf annotate: Use al->data_nr if possible The data_nr keeps the number of entries in al->data[] so it should use it when it iterates the array. The notes->src->nr_events should have the same number but it'd be natural to use al->data_nr. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index a2ee4074f768..91ad948c89d5 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1594,13 +1594,12 @@ bool ui__has_annotation(void) static double annotation_line__max_percent(struct annotation_line *al, - struct annotation *notes, unsigned int percent_type) { double percent_max = 0.0; int i; - for (i = 0; i < notes->src->nr_events; i++) { + for (i = 0; i < al->data_nr; i++) { double percent; percent = annotation_data__percent(&al->data[i], @@ -1672,7 +1671,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati void (*obj__write_graph)(void *obj, int graph)) { - double percent_max = annotation_line__max_percent(al, notes, percent_type); + double percent_max = annotation_line__max_percent(al, percent_type); int pcnt_width = annotation__pcnt_width(notes), cycles_width = annotation__cycles_width(notes); bool show_title = false; @@ -1690,7 +1689,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (al->offset != -1 && percent_max != 0.0) { int i; - for (i = 0; i < notes->src->nr_events; i++) { + for (i = 0; i < al->data_nr; i++) { double percent; percent = annotation_data__percent(&al->data[i], percent_type); -- cgit v1.2.3 From cb1e8bfc7914747692c06b215539ef80105fddbc Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:29 -0700 Subject: perf annotate: Set notes->src->nr_events early We want to use it in different places so make sure it sets properly in symbol__annotate() before creating the disasm lines. 
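The reason the count has to be known this early can be sketched as follows (hand-written illustration with simplified struct shapes, not the actual perf code): every annotation line allocates one data slot per event, so nr_events must be fixed before the first line is created.

        /*
         * Simplified illustration: per-line event data is sized from
         * notes->src->nr_events, so the value must be set before the
         * first disasm line is allocated.
         */
        struct annotation_line {
                int data_nr;                    /* == notes->src->nr_events */
                struct annotation_data data[];  /* one slot per event */
        };

        static struct annotation_line *line_new(int nr_events)
        {
                struct annotation_line *al;

                al = zalloc(sizeof(*al) + nr_events * sizeof(al->data[0]));
                if (al != NULL)
                        al->data_nr = nr_events;
                return al;
        }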
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 91ad948c89d5..09e6fdf344db 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -925,6 +925,11 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, return -1; } + if (evsel__is_group_event(evsel)) + notes->src->nr_events = evsel->core.nr_members; + else + notes->src->nr_events = 1; + if (annotate_opts.full_addr) notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); else @@ -1842,10 +1847,7 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); size_t size = symbol__size(sym); - int nr_pcnt = 1, err; - - if (evsel__is_group_event(evsel)) - nr_pcnt = evsel->core.nr_members; + int err; err = symbol__annotate(ms, evsel, parch); if (err) @@ -1861,8 +1863,6 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, return err; annotation__init_column_widths(notes, sym); - notes->src->nr_events = nr_pcnt; - annotation__update_column_widths(notes); sym->annotate2 = 1; -- cgit v1.2.3 From b00e4d0d93d30a693ee88b2c25f40181d327e53b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:30 -0700 Subject: perf annotate: Use annotation__pcnt_width() consistently The annotation__pcnt_width() calculates the screen width for the overhead (percent) area considering event groups properly. Use this function consistently so that we can make sure it has similar output in different modes. But there's a difference in stdio and tui output: stdio uses 8 and tui uses 7 for a percent. Let's use 8 and adjust the print width in __annotation_line__write() properly. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 14 +++++--------- tools/perf/util/annotate.h | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 09e6fdf344db..917897fe44a2 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -699,13 +699,13 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start int percent_type) { struct disasm_line *dl = container_of(al, struct disasm_line, al); + struct annotation *notes = symbol__annotation(sym); static const char *prev_line; if (al->offset != -1) { double max_percent = 0.0; int i, nr_percent = 1; const char *color; - struct annotation *notes = symbol__annotation(sym); for (i = 0; i < al->data_nr; i++) { double percent; @@ -775,14 +775,11 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start } else if (max_lines && printed >= max_lines) return 1; else { - int width = symbol_conf.show_total_period ? 
12 : 8; + int width = annotation__pcnt_width(notes); if (queue) return -1; - if (evsel__is_group_event(evsel)) - width *= evsel->core.nr_members; - if (!*al->line) printf(" %*s:\n", width, " "); else @@ -1111,7 +1108,7 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel) int more = 0; bool context = opts->context; u64 len; - int width = symbol_conf.show_total_period ? 12 : 8; + int width = annotation__pcnt_width(notes); int graph_dotted_len; char buf[512]; @@ -1127,7 +1124,6 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel) len = symbol__size(sym); if (evsel__is_group_event(evsel)) { - width *= evsel->core.nr_members; evsel__group_desc(evsel, buf, sizeof(buf)); evsel_name = buf; } @@ -1703,10 +1699,10 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (symbol_conf.show_total_period) { obj__printf(obj, "%11" PRIu64 " ", al->data[i].he.period); } else if (symbol_conf.show_nr_samples) { - obj__printf(obj, "%6" PRIu64 " ", + obj__printf(obj, "%7" PRIu64 " ", al->data[i].he.nr_samples); } else { - obj__printf(obj, "%6.2f ", percent); + obj__printf(obj, "%7.2f ", percent); } } } else { diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 9ba772f46270..64e70d716ff1 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -339,7 +339,7 @@ static inline int annotation__cycles_width(struct annotation *notes) static inline int annotation__pcnt_width(struct annotation *notes) { - return (symbol_conf.show_total_period ? 12 : 7) * notes->src->nr_events; + return (symbol_conf.show_total_period ? 12 : 8) * notes->src->nr_events; } static inline bool annotation_line__filter(struct annotation_line *al) -- cgit v1.2.3 From bb588e38290fb7236e7f2c0bcbd720150277ca83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:31 -0700 Subject: perf annotate: Set al->data_nr using the notes->src->nr_events This is a preparation to support skipping empty events. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 85fb0cfedf94..22289003e16d 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1037,10 +1037,8 @@ static size_t disasm_line_size(int nr) struct disasm_line *disasm_line__new(struct annotate_args *args) { struct disasm_line *dl = NULL; - int nr = 1; - - if (evsel__is_group_event(args->evsel)) - nr = args->evsel->core.nr_members; + struct annotation *notes = symbol__annotation(args->ms.sym); + int nr = notes->src->nr_events; dl = zalloc(disasm_line_size(nr)); if (!dl) -- cgit v1.2.3 From ce533c9bc6deb12535348693fd48fab92a748c2a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:32 -0700 Subject: perf annotate: Add --skip-empty option Like in 'perf report', we want to hide empty events in the 'perf annotate' output. This is consistent when the option is set in perf report. For example, the following command would use 3 events including dummy. $ perf mem record -a -- perf test -w noploop $ perf evlist cpu/mem-loads,ldlat=30/P cpu/mem-stores/P dummy:u Just using perf annotate with --group will show the all 3 events. 
$ perf annotate --group --stdio | head Percent | Source code & Disassembly of ... -------------------------------------------------------------- : 0 0xe060 <_dl_relocate_object>: 0.00 0.00 0.00 : e060: pushq %rbp 0.00 0.00 0.00 : e061: movq %rsp, %rbp 0.00 0.00 0.00 : e064: pushq %r15 0.00 0.00 0.00 : e066: movq %rdi, %r15 0.00 0.00 0.00 : e069: pushq %r14 0.00 0.00 0.00 : e06b: pushq %r13 0.00 0.00 0.00 : e06d: movl %edx, %r13d Now with --skip-empty, it'll hide the last dummy event. $ perf annotate --group --stdio --skip-empty | head Percent | Source code & Disassembly of ... ------------------------------------------------------ : 0 0xe060 <_dl_relocate_object>: 0.00 0.00 : e060: pushq %rbp 0.00 0.00 : e061: movq %rsp, %rbp 0.00 0.00 : e064: pushq %r15 0.00 0.00 : e066: movq %rdi, %r15 0.00 0.00 : e069: pushq %r14 0.00 0.00 : e06b: pushq %r13 0.00 0.00 : e06d: movl %edx, %r13d Committer testing: root@x1:~# perf evlist cpu_atom/mem-loads,ldlat=30/P cpu_atom/mem-stores/P dummy:u root@x1:~# Before: root@x1:~# perf annotate --group --stdio2 do_lookup_x | head -25 Samples: 20 of events 'cpu_atom/mem-loads,ldlat=30/P, cpu_atom/mem-stores/P, dummy:u', 4000 Hz, Event count (approx.): 769079, [percent: local period] do_lookup_x() /usr/lib64/ld-linux-x86-64.so.2 Percent 0x9900 : pushq %rbp movq %rsp,%rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx subq $0x88,%rsp movq %rdi,-0x50(%rbp) movl 8(%r9),%edi movq 0x10(%rbp),%r12 movq 0x28(%rbp),%r10 movq %rdx,-0x70(%rbp) movq %rcx,-0x58(%rbp) movq %rdi,%r11 0.00 5.73 0.00 movq %r8,-0x68(%rbp) movq (%r9),%r8 movl %esi,%eax 8.30 0.00 0.00 movl 0x30(%rbp),%r9d movl %esi,%r15d shrl $6, %eax movq %r8,%r13 root@x1:~# After: root@x1:~# perf annotate --group --skip-empty --stdio2 do_lookup_x | head -25 Samples: 20 of events 'cpu_atom/mem-loads,ldlat=30/P, cpu_atom/mem-stores/P', 4000 Hz, Event count (approx.): 769079, [percent: local period] do_lookup_x() /usr/lib64/ld-linux-x86-64.so.2 Percent 0x9900 : pushq %rbp movq %rsp,%rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx subq $0x88,%rsp movq %rdi,-0x50(%rbp) movl 8(%r9),%edi movq 0x10(%rbp),%r12 movq 0x28(%rbp),%r10 movq %rdx,-0x70(%rbp) movq %rcx,-0x58(%rbp) movq %rdi,%r11 0.00 5.73 movq %r8,-0x68(%rbp) movq (%r9),%r8 movl %esi,%eax 8.30 0.00 movl 0x30(%rbp),%r9d movl %esi,%r15d shrl $6, %eax movq %r8,%r13 root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-annotate.txt | 3 +++ tools/perf/builtin-annotate.c | 2 ++ tools/perf/util/annotate.c | 22 +++++++++++++++++----- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index b95524bea021..156c5f37b051 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -165,6 +165,9 @@ include::itrace.txt[] --type-stat:: Show stats for the data type annotation. +--skip-empty:: + Do not display empty (or dummy) events. 
+ SEE ALSO -------- diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index cf60392b1c19..efcadb7620b8 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -795,6 +795,8 @@ int cmd_annotate(int argc, const char **argv) "Show stats for the data type annotation"), OPT_BOOLEAN(0, "insn-stat", &annotate.insn_stat, "Show instruction stats for the data type annotation"), + OPT_BOOLEAN(0, "skip-empty", &symbol_conf.skip_empty, + "Do not display empty (or dummy) events in the output"), OPT_END() }; int ret; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 917897fe44a2..eafe8d65052e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -848,6 +848,10 @@ static void annotation__calc_percent(struct annotation *notes, BUG_ON(i >= al->data_nr); + if (symbol_conf.skip_empty && + evsel__hists(evsel)->stats.nr_samples == 0) + continue; + data = &al->data[i++]; calc_percent(notes, evsel, data, al->offset, end); @@ -901,7 +905,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, .options = &annotate_opts, }; struct arch *arch = NULL; - int err; + int err, nr; err = evsel__get_arch(evsel, &arch); if (err < 0) @@ -922,10 +926,18 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, return -1; } - if (evsel__is_group_event(evsel)) - notes->src->nr_events = evsel->core.nr_members; - else - notes->src->nr_events = 1; + nr = 0; + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + + for_each_group_evsel(pos, evsel) { + if (symbol_conf.skip_empty && + evsel__hists(pos)->stats.nr_samples == 0) + continue; + nr++; + } + } + notes->src->nr_events = nr ? nr : 1; if (annotate_opts.full_addr) notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); -- cgit v1.2.3 From 92841d6e23de09f180f948f99ddb40135775cedc Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 4 Aug 2024 21:30:19 -0400 Subject: selftest/cgroup: Add new test cases to test_cpuset_prs.sh Add new test cases to test_cpuset_prs.sh to cover corner cases reported in previous fix commits. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7c08cc153367..7295424502b9 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -321,7 +321,7 @@ TEST_MATRIX=( # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # - # Incorrect change to cpuset.cpus invalidates partition root + # Incorrect change to cpuset.cpus[.exclusive] invalidates partition root # # Adding CPUs to partition root that are not in parent's # cpuset.cpus is allowed, but those extra CPUs are ignored. @@ -365,6 +365,16 @@ TEST_MATRIX=( # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it " C0-3 . . C4-5 X5 . . . 0 A1:0-3,B1:4-5" + # Child partition root that try to take all CPUs from parent partition + # with tasks will remain invalid. + " C1-4:P1:S+ P1 . . . . . . 0 A1:1-4,A2:1-4 A1:P1,A2:P-1" + " C1-4:P1:S+ P1 . . . C1-4 . . 0 A1,A2:1-4 A1:P1,A2:P1" + " C1-4:P1:S+ P1 . . T C1-4 . . 
0 A1:1-4,A2:1-4 A1:P1,A2:P-1" + + # Clearing of cpuset.cpus with a preset cpuset.cpus.exclusive shouldn't + # affect cpuset.cpus.exclusive.effective. + " C1-4:X3:S+ C1:X3 . . . C . . 0 A2:1-4,XA2:3" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: -- cgit v1.2.3 From e8fc78eb658a1ebcb871e93bbe4f18e9d51fdd3a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 2 Aug 2024 12:34:36 +0100 Subject: tools: ynl: remove extraneous ; after statements A couple of statements are followed by two semicolons; replace these with just one. Signed-off-by: Colin Ian King Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20240802113436.448939-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index fcb18a5a6d70..e16cef160bc2 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -696,14 +696,14 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) addr.nl_family = AF_NETLINK; if (bind(ys->socket, (struct sockaddr *)&addr, sizeof(addr)) < 0) { __perr(yse, "unable to bind to a socket address"); - goto err_close_sock;; + goto err_close_sock; } memset(&addr, 0, sizeof(addr)); addrlen = sizeof(addr); if (getsockname(ys->socket, (struct sockaddr *)&addr, &addrlen) < 0) { __perr(yse, "unable to read socket address"); - goto err_close_sock;; + goto err_close_sock; } ys->portid = addr.nl_pid; ys->seq = random(); -- cgit v1.2.3 From 19c91bd8932a2e00f8de76022c780951166be9a0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jul 2024 03:51:36 +0000 Subject: memblock tests: include memory_hotplug.h in mmzone.h as kernel does In kernel code, memory_hotplug.h is included in mmzone.h instead of in init.h. Let's sync with the kernel. This is a preparation for moving init.h into the common include directory. Signed-off-by: Wei Yang CC: Mike Rapoport Link: https://lore.kernel.org/r/20240712035138.24674-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/testing/memblock/linux/init.h | 1 - tools/testing/memblock/linux/mmzone.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/memblock/linux/init.h b/tools/testing/memblock/linux/init.h index 828e0ee0bc6c..4aeddce53310 100644 --- a/tools/testing/memblock/linux/init.h +++ b/tools/testing/memblock/linux/init.h @@ -4,7 +4,6 @@ #include <linux/compiler.h> #include <linux/export.h> -#include <linux/memory_hotplug.h> #define __section(section) __attribute__((__section__(section))) diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index 71546e15bdd3..bb682659a12d 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -3,6 +3,7 @@ #define _TOOLS_MMZONE_H #include +#include <linux/memory_hotplug.h> struct pglist_data *first_online_pgdat(void); struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); -- cgit v1.2.3 From d68c08173b70952cd74bb45075b843f4a637a43b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jul 2024 03:51:37 +0000 Subject: memblock tests: include export.h in linkage.h as kernel does In kernel code, linkage.h includes export.h. Let's sync with the kernel. This is a preparation for moving init.h into the common include directory.
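Note that angle-bracketed #include targets throughout this series were stripped in rendering; where the commit message names the header, they can be reconstructed. A sketch of tools/include/linux/linkage.h after this patch, assuming (per the message above) that the stripped target in the hunk below is <linux/export.h>:

	#ifndef _TOOLS_INCLUDE_LINUX_LINKAGE_H
	#define _TOOLS_INCLUDE_LINUX_LINKAGE_H

	/* mirrors the kernel's linkage.h, which pulls in export.h */
	#include <linux/export.h>

	#endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */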
Signed-off-by: Wei Yang CC: Mike Rapoport Link: https://lore.kernel.org/r/20240712035138.24674-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/linkage.h | 2 ++ tools/testing/memblock/linux/init.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/linux/linkage.h b/tools/include/linux/linkage.h index bc763d500262..20dee24d7e1b 100644 --- a/tools/include/linux/linkage.h +++ b/tools/include/linux/linkage.h @@ -1,4 +1,6 @@ #ifndef _TOOLS_INCLUDE_LINUX_LINKAGE_H #define _TOOLS_INCLUDE_LINUX_LINKAGE_H +#include + #endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */ diff --git a/tools/testing/memblock/linux/init.h b/tools/testing/memblock/linux/init.h index 4aeddce53310..bd74abc5cba6 100644 --- a/tools/testing/memblock/linux/init.h +++ b/tools/testing/memblock/linux/init.h @@ -3,7 +3,6 @@ #define _LINUX_INIT_H #include -#include #define __section(section) __attribute__((__section__(section))) -- cgit v1.2.3 From e2ae9cf39f8806d2d8f5eb0a22ba511804a804ec Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jul 2024 03:51:38 +0000 Subject: tools/testing: abstract two init.h into common include directory Currently we have two test suits define its own init.h. This is a little redundant. Let's create a init.h in common include directory and merge these two into it. Signed-off-by: Wei Yang CC: Mike Rapoport CC: Liam R. Howlett Link: https://lore.kernel.org/r/20240712035138.24674-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/compiler.h | 4 ---- tools/include/linux/init.h | 40 +++++++++++++++++++++++++++++++++++ tools/testing/memblock/linux/init.h | 32 ---------------------------- tools/testing/radix-tree/linux/init.h | 2 -- tools/testing/radix-tree/maple.c | 2 +- 5 files changed, 41 insertions(+), 39 deletions(-) create mode 100644 tools/include/linux/init.h delete mode 100644 tools/testing/memblock/linux/init.h delete mode 100644 tools/testing/radix-tree/linux/init.h (limited to 'tools') diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 6f7f22ac9da5..4b5a45919ff8 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -126,10 +126,6 @@ # define unlikely(x) __builtin_expect(!!(x), 0) #endif -#ifndef __init -# define __init -#endif - #include /* diff --git a/tools/include/linux/init.h b/tools/include/linux/init.h new file mode 100644 index 000000000000..7ed407976dda --- /dev/null +++ b/tools/include/linux/init.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_INIT_H_ +#define _TOOLS_LINUX_INIT_H_ + +#include + +#ifndef __init +# define __init +#endif + +#ifndef __exit +# define __exit +#endif + +#define __section(section) __attribute__((__section__(section))) + +#define __initconst +#define __meminit +#define __meminitdata +#define __refdata +#define __initdata + +struct obs_kernel_param { + const char *str; + int (*setup_func)(char *st); + int early; +}; + +#define __setup_param(str, unique_id, fn, early) \ + static const char __setup_str_##unique_id[] __initconst \ + __aligned(1) = str; \ + static struct obs_kernel_param __setup_##unique_id \ + __used __section(".init.setup") \ + __aligned(__alignof__(struct obs_kernel_param)) = \ + { __setup_str_##unique_id, fn, early } + +#define early_param(str, fn) \ + __setup_param(str, fn, fn, 1) + +#endif /* _TOOLS_LINUX_INIT_H_ */ diff --git a/tools/testing/memblock/linux/init.h b/tools/testing/memblock/linux/init.h deleted file mode 
100644 index bd74abc5cba6..000000000000 --- a/tools/testing/memblock/linux/init.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_INIT_H -#define _LINUX_INIT_H - -#include - -#define __section(section) __attribute__((__section__(section))) - -#define __initconst -#define __meminit -#define __meminitdata -#define __refdata -#define __initdata - -struct obs_kernel_param { - const char *str; - int (*setup_func)(char *st); - int early; -}; - -#define __setup_param(str, unique_id, fn, early) \ - static const char __setup_str_##unique_id[] __initconst \ - __aligned(1) = str; \ - static struct obs_kernel_param __setup_##unique_id \ - __used __section(".init.setup") \ - __aligned(__alignof__(struct obs_kernel_param)) = \ - { __setup_str_##unique_id, fn, early } - -#define early_param(str, fn) \ - __setup_param(str, fn, fn, 1) - -#endif diff --git a/tools/testing/radix-tree/linux/init.h b/tools/testing/radix-tree/linux/init.h deleted file mode 100644 index 81563c3dfce7..000000000000 --- a/tools/testing/radix-tree/linux/init.h +++ /dev/null @@ -1,2 +0,0 @@ -#define __init -#define __exit diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cd1cf05503b4..3437292babff 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -14,7 +14,7 @@ #include "test.h" #include #include -#include "linux/init.h" +#include #define module_init(x) #define module_exit(x) -- cgit v1.2.3 From 39f64e402f659890a99d415eaf63a01f3b80a9a8 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:15 +0000 Subject: memblock test: fix implicit declaration of function 'virt_to_phys' Commit 94ff46de4a73 ("memblock: Move late alloc warning down to phys alloc") introduce the usage of virt_to_phys(), which is not defined in memblock tests. Define it in mm.h to fix the build error. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index cad4f2927983..677c37e4a18c 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -25,6 +25,12 @@ static inline void *phys_to_virt(unsigned long address) return __va(address); } +#define virt_to_phys virt_to_phys +static inline phys_addr_t virt_to_phys(volatile void *address) +{ + return (phys_addr_t)address; +} + void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); static inline void totalram_pages_inc(void) -- cgit v1.2.3 From 9f76c2ade323121f9006f6a529e0795317e16b5c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:16 +0000 Subject: memblock test: add the definition of __setup() Commit 1e4c64b71c9b ("mm/memblock: Add "reserve_mem" to reserved named memory at boot up") introduce usage of __setup(), which is not defined in memblock test. Define it in init.h to fix the build error. 
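For illustration, a hypothetical use of the __setup() machinery being completed here; the parameter and handler names below are invented, but the registration mirrors how mm/memblock.c hooks its "reserve_mem" parameter, which is what broke this build.

	/* hypothetical boot-parameter handler; "example_opt" is illustrative
	 * only and does not exist in the tree */
	static int __init parse_example_opt(char *str)
	{
		/* consume str here; returning 1 marks the parameter handled */
		return 1;
	}
	__setup("example_opt=", parse_example_opt);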
Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/init.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/init.h b/tools/include/linux/init.h index 7ed407976dda..51b5cde28639 100644 --- a/tools/include/linux/init.h +++ b/tools/include/linux/init.h @@ -34,6 +34,9 @@ struct obs_kernel_param { __aligned(__alignof__(struct obs_kernel_param)) = \ { __setup_str_##unique_id, fn, early } +#define __setup(str, fn) \ + __setup_param(str, fn, fn, 0) + #define early_param(str, fn) \ __setup_param(str, fn, fn, 1) -- cgit v1.2.3 From a88cde5769d523e4ae6aad61237e4a5f6bd2309a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:17 +0000 Subject: memblock test: fix implicit declaration of function 'memparse' Commit 1e4c64b71c9b ("mm/memblock: Add "reserve_mem" to reserved named memory at boot up") introduce the usage of memparse(), which is not defined in memblock test. Add the definition and link it to fix the build. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/string.h | 1 + tools/lib/cmdline.c | 53 +++++++++++++++++++++++++++++++++++ tools/testing/memblock/Makefile | 2 +- tools/testing/memblock/linux/kernel.h | 1 + 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 tools/lib/cmdline.c (limited to 'tools') diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index db5c99318c79..fb8eda3019b5 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -47,4 +47,5 @@ extern char * __must_check skip_spaces(const char *); extern char *strim(char *); extern void *memchr_inv(const void *start, int c, size_t bytes); +extern unsigned long long memparse(const char *ptr, char **retptr); #endif /* _TOOLS_LINUX_STRING_H_ */ diff --git a/tools/lib/cmdline.c b/tools/lib/cmdline.c new file mode 100644 index 000000000000..c85f00f43c5e --- /dev/null +++ b/tools/lib/cmdline.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * From lib/cmdline.c + */ +#include + +#if __has_attribute(__fallthrough__) +# define fallthrough __attribute__((__fallthrough__)) +#else +# define fallthrough do {} while (0) /* fallthrough */ +#endif + +unsigned long long memparse(const char *ptr, char **retptr) +{ + char *endptr; /* local pointer to end of parsed string */ + + unsigned long long ret = strtoll(ptr, &endptr, 0); + + switch (*endptr) { + case 'E': + case 'e': + ret <<= 10; + fallthrough; + case 'P': + case 'p': + ret <<= 10; + fallthrough; + case 'T': + case 't': + ret <<= 10; + fallthrough; + case 'G': + case 'g': + ret <<= 10; + fallthrough; + case 'M': + case 'm': + ret <<= 10; + fallthrough; + case 'K': + case 'k': + ret <<= 10; + endptr++; + fallthrough; + default: + break; + } + + if (retptr) + *retptr = endptr; + + return ret; +} diff --git a/tools/testing/memblock/Makefile b/tools/testing/memblock/Makefile index 7a1ca694a982..d80982ccdc20 100644 --- a/tools/testing/memblock/Makefile +++ b/tools/testing/memblock/Makefile @@ -8,7 +8,7 @@ LDFLAGS += -fsanitize=address -fsanitize=undefined TARGETS = main TEST_OFILES = tests/alloc_nid_api.o tests/alloc_helpers_api.o tests/alloc_api.o \ tests/basic_api.o tests/common.o tests/alloc_exact_nid_api.o -DEP_OFILES = memblock.o lib/slab.o mmzone.o slab.o +DEP_OFILES = memblock.o lib/slab.o mmzone.o slab.o cmdline.o 
OFILES = main.o $(DEP_OFILES) $(TEST_OFILES) EXTR_SRC = ../../../mm/memblock.c diff --git a/tools/testing/memblock/linux/kernel.h b/tools/testing/memblock/linux/kernel.h index d2f148bd8902..c16d9cd2d1ae 100644 --- a/tools/testing/memblock/linux/kernel.h +++ b/tools/testing/memblock/linux/kernel.h @@ -8,5 +8,6 @@ #include #include #include +#include #endif -- cgit v1.2.3 From 8ac13bc7c266102f1068faafa5314522b68ebe65 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:18 +0000 Subject: memblock test: fix implicit declaration of function 'isspace' Commit 1e4c64b71c9b ("mm/memblock: Add "reserve_mem" to reserved named memory at boot up") introduce usage of isspace(). Let's include in kernel.h to fix this. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-4-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/testing/memblock/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/memblock/linux/kernel.h b/tools/testing/memblock/linux/kernel.h index c16d9cd2d1ae..4d1012d5be6e 100644 --- a/tools/testing/memblock/linux/kernel.h +++ b/tools/testing/memblock/linux/kernel.h @@ -9,5 +9,6 @@ #include #include #include +#include #endif -- cgit v1.2.3 From 9e3d665384fca2a1c56283c7a79a968243ef4614 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:19 +0000 Subject: memblock test: fix implicit declaration of function 'strscpy' Commit 1e4c64b71c9b ("mm/memblock: Add "reserve_mem" to reserved named memory at boot up") introduce the usage of strscpy, which breaks the memblock test. Let's define it as strcpy in userspace to fix it. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-5-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/string.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index fb8eda3019b5..41e7fa734922 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -12,6 +12,8 @@ void argv_free(char **argv); int strtobool(const char *s, bool *res); +#define strscpy strcpy + /* * glibc based builds needs the extern while uClibc doesn't. * However uClibc headers also define __GLIBC__ hence the hack below -- cgit v1.2.3 From 2576b20abdb18d80c22cea783fda80e2235f36dd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Aug 2024 12:44:20 -0700 Subject: perf test: Add build test for JEVENTS_ARCH=all Building with JEVENTS_ARCH=all builds all CPU types and allows things like assertions to check the validity of the input JSON. 
Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Atish Patra Cc: Changbin Du Cc: Charles Ci-Jyun Wu Cc: Eric Lin Cc: Greentime Hu Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Inochi Amaoto Cc: James Clark Cc: Ji Sheng Teoh Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Locus Wei-Han Chen Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Samuel Holland Cc: Sandipan Das Cc: Vincent Chen Cc: Will Deacon Cc: Xu Yang Link: https://lore.kernel.org/r/20240805194424.597244-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/make b/tools/perf/tests/make index a1f8adf85367..8a9edf758f10 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -70,6 +70,7 @@ make_python_perf_so := $(python_perf_so) make_debug := DEBUG=1 make_nondistro := BUILD_NONDISTRO=1 make_extra_tests := EXTRA_TESTS=1 +make_jevents_all := JEVENTS_ARCH=all make_no_bpf_skel := BUILD_BPF_SKEL=0 make_gen_vmlinux_h := GEN_VMLINUX_H=1 make_no_libperl := NO_LIBPERL=1 @@ -140,6 +141,7 @@ run += make_python_perf_so run += make_debug run += make_nondistro run += make_extra_tests +run += make_jevents_all run += make_no_bpf_skel run += make_gen_vmlinux_h run += make_no_libperl -- cgit v1.2.3 From b79f9a437a5758c6dee29c0dbcce8db293f088c0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Aug 2024 12:44:21 -0700 Subject: perf pmu-events: Change dependencies for empty-pmu-events.c test Switch from $? (all the prerequisites that are newer than the target) to $^ (all the prerequisites) as touching jevents.py will mean that empty-pmu-events.c won't be passed to the diff command breaking the build. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Atish Patra Cc: Changbin Du Cc: Charles Ci-Jyun Wu Cc: Eric Lin Cc: Greentime Hu Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Inochi Amaoto Cc: James Clark Cc: Ji Sheng Teoh Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Locus Wei-Han Chen Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Samuel Holland Cc: Sandipan Das Cc: Vincent Chen Cc: Will Deacon Cc: Xu Yang Link: https://lore.kernel.org/r/20240805194424.597244-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/Build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index c3fa43c49706..d941bc9d16e9 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -39,7 +39,7 @@ $(TEST_EMPTY_PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(ME $(EMPTY_PMU_EVENTS_TEST_LOG): $(EMPTY_PMU_EVENTS_C) $(TEST_EMPTY_PMU_EVENTS_C) $(call rule_mkdir) - $(Q)$(call echo-cmd,test)diff -u $? 
2> $@ || (cat $@ && false) + $(Q)$(call echo-cmd,test)diff -u $^ 2> $@ || (cat $@ && false) $(PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) $(EMPTY_PMU_EVENTS_TEST_LOG) $(call rule_mkdir) -- cgit v1.2.3 From c4f74bb61ae01b30067ae3878ca8ba7d677283ae Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Aug 2024 12:44:22 -0700 Subject: perf pmu-events: Remove duplicated ampereone event OP_SPEC is repeated twice in the file which will break invariants in 'perf list' as discussed in this thread: https://lore.kernel.org/linux-perf-users/20240719081651.24853-1-eric.lin@sifive.com/ Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Atish Patra Cc: Changbin Du Cc: Charles Ci-Jyun Wu Cc: Eric Lin Cc: Greentime Hu Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Inochi Amaoto Cc: James Clark Cc: Ji Sheng Teoh Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Locus Wei-Han Chen Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Samuel Holland Cc: Sandipan Das Cc: Vincent Chen Cc: Will Deacon Cc: Xu Yang Link: https://lore.kernel.org/r/20240805194424.597244-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json index 18d1f2f76a23..9fe697d12fe0 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json @@ -77,9 +77,6 @@ { "ArchStdEvent": "OP_RETIRED" }, - { - "ArchStdEvent": "OP_SPEC" - }, { "PublicDescription": "Operation speculatively executed, NOP", "EventCode": "0x100", -- cgit v1.2.3 From 4bd380390fcce5c77610bc9067a9a97cfd999402 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Aug 2024 12:44:24 -0700 Subject: perf jevents.py: Ensure event names aren't duplicated Duplicate event names break invariants in 'perf list'. Assert that an event name isn't duplicated so that broken JSON won't build. 
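As a sketch of the invariant being enforced (the actual change is the Python assertion in the hunk below): jevents emits per-PMU event tables sorted by name, so uniqueness means no two adjacent entries may share a name. In C terms, over a generated table, with 'events' and 'nr_events' assumed rather than taken from this patch:

	/* illustrative check over a sorted per-PMU table of struct pmu_event */
	for (size_t i = 1; i < nr_events; i++)
		assert(strcmp(events[i - 1].name, events[i].name) != 0);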
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Atish Patra Cc: Changbin Du Cc: Charles Ci-Jyun Wu Cc: Eric Lin Cc: Greentime Hu Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Inochi Amaoto Cc: James Clark Cc: Ji Sheng Teoh Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Locus Wei-Han Chen Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Samuel Holland Cc: Sandipan Das Cc: Vincent Chen Cc: Will Deacon Cc: Xu Yang Link: https://lore.kernel.org/r/20240805194424.597244-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index fcf0158438b5..1d96b2204e52 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -503,8 +503,11 @@ def print_pending_events() -> None: first = True last_pmu = None + last_name = None pmus = set() for event in sorted(_pending_events, key=event_cmp_key): + if last_pmu and last_pmu == event.pmu: + assert event.name != last_name, f"Duplicate event: {last_pmu}/{last_name}/ in {_pending_events_tblname}" if event.pmu != last_pmu: if not first: _args.output_file.write('};\n') @@ -516,6 +519,7 @@ def print_pending_events() -> None: pmus.add((event.pmu, pmu_name)) _args.output_file.write(event.to_c_string(metric=False)) + last_name = event.name _pending_events = [] _args.output_file.write(f""" -- cgit v1.2.3 From ff358ada070fa1b64a6368147b2b56cc6a921847 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 31 Jul 2024 11:08:32 +0000 Subject: selftests/bpf: add negative tests for new VFS based BPF kfuncs Add a bunch of negative selftests responsible for asserting that the BPF verifier successfully rejects a BPF program load when the underlying BPF program misuses one of the newly introduced VFS based BPF kfuncs. The following VFS based BPF kfuncs are extensively tested within this new selftest: * struct file *bpf_get_task_exe_file(struct task_struct *); * void bpf_put_file(struct file *); * int bpf_path_d_path(struct path *, char *, size_t); Acked-by: Christian Brauner Acked-by: Song Liu Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20240731110833.1834742-3-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 26 ++++ tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_vfs_reject.c | 161 +++++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_vfs_reject.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 828556cdc2f0..b0668f29f7b3 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -195,6 +195,32 @@ extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __ksym; */ extern void bpf_throw(u64 cookie) __ksym; +/* Description + * Acquire a reference on the exe_file member field belonging to the + * mm_struct that is nested within the supplied task_struct. The supplied + * task_struct must be trusted/referenced. + * Returns + * A referenced file pointer pointing to the exe_file member field of the + * mm_struct nested in the supplied task_struct, or NULL. 
+ */ +extern struct file *bpf_get_task_exe_file(struct task_struct *task) __ksym; + +/* Description + * Release a reference on the supplied file. The supplied file must be + * acquired. + */ +extern void bpf_put_file(struct file *file) __ksym; + +/* Description + * Resolve a pathname for the supplied path and store it in the supplied + * buffer. The supplied path must be trusted/referenced. + * Returns + * A positive integer corresponding to the length of the resolved pathname, + * including the NULL termination character, stored in the supplied + * buffer. On error, a negative integer is returned. + */ +extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __ksym; + /* This macro must be used to mark the exception callback corresponding to the * main program. For example: * diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 67a49d12472c..14d74ba2188e 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -85,6 +85,7 @@ #include "verifier_value_or_null.skel.h" #include "verifier_value_ptr_arith.skel.h" #include "verifier_var_off.skel.h" +#include "verifier_vfs_reject.skel.h" #include "verifier_xadd.skel.h" #include "verifier_xdp.skel.h" #include "verifier_xdp_direct_packet_access.skel.h" @@ -205,6 +206,7 @@ void test_verifier_value(void) { RUN(verifier_value); } void test_verifier_value_illegal_alu(void) { RUN(verifier_value_illegal_alu); } void test_verifier_value_or_null(void) { RUN(verifier_value_or_null); } void test_verifier_var_off(void) { RUN(verifier_var_off); } +void test_verifier_vfs_reject(void) { RUN(verifier_vfs_reject); } void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c new file mode 100644 index 000000000000..d6d3f4fcb24c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC. */ + +#include +#include +#include +#include + +#include "bpf_misc.h" +#include "bpf_experimental.h" + +static char buf[PATH_MAX]; + +SEC("lsm.s/file_open") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int BPF_PROG(get_task_exe_file_kfunc_null) +{ + struct file *acquired; + + /* Can't pass a NULL pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(NULL); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/inode_getxattr") +__failure __msg("arg#0 pointer type STRUCT task_struct must point to scalar, or struct with scalar") +int BPF_PROG(get_task_exe_file_kfunc_fp) +{ + u64 x; + struct file *acquired; + struct task_struct *task; + + task = (struct task_struct *)&x; + /* Can't pass random frame pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(task); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(get_task_exe_file_kfunc_untrusted) +{ + struct file *acquired; + struct task_struct *parent; + + /* Walking a trusted struct task_struct returned from + * bpf_get_current_task_btf() yields an untrusted pointer. 
+ */ + parent = bpf_get_current_task_btf()->parent; + /* Can't pass untrusted pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(parent); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("Unreleased reference") +int BPF_PROG(get_task_exe_file_kfunc_unreleased) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + /* Acquired but never released. */ + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("release kernel function bpf_put_file expects") +int BPF_PROG(put_file_kfunc_unacquired, struct file *file) +{ + /* Can't release an unacquired pointer. */ + bpf_put_file(file); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int BPF_PROG(path_d_path_kfunc_null) +{ + /* Can't pass NULL value to bpf_path_d_path() kfunc. */ + bpf_path_d_path(NULL, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/task_alloc") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(path_d_path_kfunc_untrusted_from_argument, struct task_struct *task) +{ + struct path *root; + + /* Walking a trusted argument typically yields an untrusted + * pointer. This is one example of that. + */ + root = &task->fs->root; + bpf_path_d_path(root, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(path_d_path_kfunc_untrusted_from_current) +{ + struct path *pwd; + struct task_struct *current; + + current = bpf_get_current_task_btf(); + /* Walking a trusted pointer returned from bpf_get_current_task_btf() + * yields an untrusted pointer. + */ + pwd = ¤t->fs->pwd; + bpf_path_d_path(pwd, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("kernel function bpf_path_d_path args#0 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") +int BPF_PROG(path_d_path_kfunc_type_mismatch, struct file *file) +{ + bpf_path_d_path((struct path *)&file->f_task_work, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("invalid access to map value, value_size=4096 off=0 size=8192") +int BPF_PROG(path_d_path_kfunc_invalid_buf_sz, struct file *file) +{ + /* bpf_path_d_path() enforces a constraint on the buffer size supplied + * by the BPF LSM program via the __sz annotation. buf here is set to + * PATH_MAX, so let's ensure that the BPF verifier rejects BPF_PROG_LOAD + * attempts if the supplied size and the actual size of the buffer + * mismatches. + */ + bpf_path_d_path(&file->f_path, buf, PATH_MAX * 2); + return 0; +} + +SEC("fentry/vfs_open") +__failure __msg("calling kernel function bpf_path_d_path is not allowed") +int BPF_PROG(path_d_path_kfunc_non_lsm, struct path *path, struct file *f) +{ + /* Calling bpf_path_d_path() from a non-LSM BPF program isn't permitted. + */ + bpf_path_d_path(path, buf, sizeof(buf)); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 2b399b9b1f995d71f53e94a41e5717db46c39459 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 31 Jul 2024 11:08:33 +0000 Subject: selftests/bpf: add positive tests for new VFS based BPF kfuncs Add a bunch of positive selftests which extensively cover the various contexts and parameters in which the new VFS based BPF kfuncs may be used from. 
Again, the following VFS based BPF kfuncs are thoroughly tested within this new selftest: * struct file *bpf_get_task_exe_file(struct task_struct *); * void bpf_put_file(struct file *); * int bpf_path_d_path(struct path *, char *, size_t); Acked-by: Christian Brauner Acked-by: Song Liu Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20240731110833.1834742-4-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_vfs_accept.c | 85 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_vfs_accept.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 14d74ba2188e..f8f546eba488 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -85,6 +85,7 @@ #include "verifier_value_or_null.skel.h" #include "verifier_value_ptr_arith.skel.h" #include "verifier_var_off.skel.h" +#include "verifier_vfs_accept.skel.h" #include "verifier_vfs_reject.skel.h" #include "verifier_xadd.skel.h" #include "verifier_xdp.skel.h" @@ -206,6 +207,7 @@ void test_verifier_value(void) { RUN(verifier_value); } void test_verifier_value_illegal_alu(void) { RUN(verifier_value_illegal_alu); } void test_verifier_value_or_null(void) { RUN(verifier_value_or_null); } void test_verifier_var_off(void) { RUN(verifier_var_off); } +void test_verifier_vfs_accept(void) { RUN(verifier_vfs_accept); } void test_verifier_vfs_reject(void) { RUN(verifier_vfs_reject); } void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c new file mode 100644 index 000000000000..a7c0a553aa50 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC. */ + +#include +#include +#include + +#include "bpf_misc.h" +#include "bpf_experimental.h" + +static char buf[64]; + +SEC("lsm.s/file_open") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_current_sleepable) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm/file_open") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_current_non_sleepable, struct file *file) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/task_alloc") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_argument, + struct task_struct *task) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(task); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/inode_getattr") +__success +int BPF_PROG(path_d_path_from_path_argument, struct path *path) +{ + int ret; + + ret = bpf_path_d_path(path, buf, sizeof(buf)); + __sink(ret); + return 0; +} + +SEC("lsm.s/file_open") +__success +int BPF_PROG(path_d_path_from_file_argument, struct file *file) +{ + int ret; + struct path *path; + + /* The f_path member is a path which is embedded directly within a + * file. 
Therefore, a pointer to such embedded members are still + * recognized by the BPF verifier as being PTR_TRUSTED as it's + * essentially PTR_TRUSTED w/ a non-zero fixed offset. + */ + path = &file->f_path; + ret = bpf_path_d_path(path, buf, sizeof(buf)); + __sink(ret); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 2a6b6c9a226279b4f6668450ddb21ae655558087 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 27 Jul 2024 23:37:36 +0900 Subject: selftests: harness: remove unneeded __constructor_order_last() __constructor_order_last() is unneeded. If __constructor_order_last() is not called on backward-order systems, __constructor_order will remain 0 instead of being set to _CONSTRUCTOR_ORDER_BACKWARD (= -1). __LIST_APPEND() will still take the 'else' branch, so there is no difference in the behavior. Signed-off-by: Masahiro Yamada Signed-off-by: Shuah Khan --- tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c | 6 ------ tools/testing/selftests/hid/hid_bpf.c | 6 ------ tools/testing/selftests/kselftest_harness.h | 10 +--------- tools/testing/selftests/rtc/rtctest.c | 7 ------- 4 files changed, 1 insertion(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c index ea0cdc37b44f..7ee7492138c6 100644 --- a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c +++ b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c @@ -257,12 +257,6 @@ TEST_F(attest_fixture, att_inval_addr) att_inval_addr_test(&self->uvio_attest.meas_addr, _metadata, self); } -static void __attribute__((constructor)) __constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { int fd = open(UV_PATH, O_ACCMODE); diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index dc0408a831d0..f2bd20ca88bb 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -1331,12 +1331,6 @@ static int libbpf_print_fn(enum libbpf_print_level level, return 0; } -static void __attribute__((constructor)) __constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { /* Use libbpf 1.0 API mode */ diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 40723a6a083f..71648d2102cb 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -488,12 +488,6 @@ * Use once to append a main() to the test file. 
*/ #define TEST_HARNESS_MAIN \ - static void __attribute__((constructor)) \ - __constructor_order_last(void) \ - { \ - if (!__constructor_order) \ - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; \ - } \ int main(int argc, char **argv) { \ return test_harness_run(argc, argv); \ } @@ -891,7 +885,6 @@ static struct __fixture_metadata *__fixture_list = &_fixture_global; static int __constructor_order; #define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 static inline void __register_fixture(struct __fixture_metadata *f) { @@ -1337,8 +1330,7 @@ static int test_harness_run(int argc, char **argv) static void __attribute__((constructor)) __constructor_order_first(void) { - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_FORWARD; + __constructor_order = _CONSTRUCTOR_ORDER_FORWARD; } #endif /* __KSELFTEST_HARNESS_H */ diff --git a/tools/testing/selftests/rtc/rtctest.c b/tools/testing/selftests/rtc/rtctest.c index 63ce02d1d5cc..9647b14b47c5 100644 --- a/tools/testing/selftests/rtc/rtctest.c +++ b/tools/testing/selftests/rtc/rtctest.c @@ -410,13 +410,6 @@ TEST_F_TIMEOUT(rtc, alarm_wkalm_set_minute, 65) { ASSERT_EQ(new, secs); } -static void __attribute__((constructor)) -__constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { switch (argc) { -- cgit v1.2.3 From 2c082b62aeb5e818d9e9588253fc7b73fc5ab81b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 27 Jul 2024 23:37:37 +0900 Subject: selftests: harness: rename __constructor_order for clarification Now, __constructor_order is boolean; 1 for forward-order systems, 0 for backward-order systems while parsing __LIST_APPEND(). Change it into a bool variable, and rename it for clarification. Signed-off-by: Masahiro Yamada Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 71648d2102cb..a5a72415e37b 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -818,7 +818,7 @@ item->prev = item; \ return; \ } \ - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + if (__constructor_order_forward) { \ item->next = NULL; \ item->prev = head->prev; \ item->prev->next = item; \ @@ -882,9 +882,7 @@ struct __test_xfail { } static struct __fixture_metadata *__fixture_list = &_fixture_global; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 +static bool __constructor_order_forward; static inline void __register_fixture(struct __fixture_metadata *f) { @@ -935,7 +933,7 @@ static inline bool __test_passed(struct __test_metadata *metadata) * list so tests are run in source declaration order. * https://gcc.gnu.org/onlinedocs/gccint/Initialization.html * However, it seems not all toolchains do this correctly, so use - * __constructor_order to detect which direction is called first + * __constructor_order_foward to detect which direction is called first * and adjust list building logic to get things running in the right * direction. 
*/ @@ -1330,7 +1328,7 @@ static int test_harness_run(int argc, char **argv) static void __attribute__((constructor)) __constructor_order_first(void) { - __constructor_order = _CONSTRUCTOR_ORDER_FORWARD; + __constructor_order_forward = true; } #endif /* __KSELFTEST_HARNESS_H */ -- cgit v1.2.3 From cfdbfb94b3824b8da3a6b21252d24754d0162ab6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:48 -0700 Subject: rcutorture: Add rcutree.nohz_full_patience_delay to TREE07 This commit adds the rcutree.nohz_full_patience_delay=1000 kernel boot parameter to the TREE07 scenario, on the observation that "if it ain't tested, it don't work". Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot index 979edbf4c820..55ce305b2a3d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot @@ -2,3 +2,4 @@ nohz_full=2-9 rcutorture.stall_cpu=14 rcutorture.stall_cpu_holdoff=90 rcutorture.fwd_progress=0 +rcutree.nohz_full_patience_delay=1000 -- cgit v1.2.3 From 8681156c0939a7511c47b9cc462390a83f0e846a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 6 Aug 2024 16:09:04 -0700 Subject: selftests/bpf: Add tests for bpf_get_dentry_xattr Add test for bpf_get_dentry_xattr on hook security_inode_getxattr. Verify that the kfunc can read the xattr. Also test failing getxattr from user space by returning non-zero from the LSM bpf program. Acked-by: Christian Brauner Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240806230904.71194-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_kfuncs.h | 11 ++++++- tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c | 9 +++++- tools/testing/selftests/bpf/progs/test_get_xattr.c | 37 +++++++++++++++++++--- 3 files changed, 50 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 3b6675ab4086..2eb3483f2fb0 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -45,7 +45,7 @@ extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clo /* Description * Modify the address of a AF_UNIX sockaddr. - * Returns__bpf_kfunc + * Returns * -EINVAL if the address size is too big or, 0 if the sockaddr was successfully modified. 
*/ extern int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, @@ -78,4 +78,13 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, extern bool bpf_session_is_return(void) __ksym __weak; extern __u64 *bpf_session_cookie(void) __ksym __weak; + +struct dentry; +/* Description + * Returns xattr of a dentry + * Returns + * Error code + */ +extern int bpf_get_dentry_xattr(struct dentry *dentry, const char *name, + struct bpf_dynptr *value_ptr) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c index 37056ba73847..5a0b51157451 100644 --- a/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c @@ -16,6 +16,7 @@ static void test_xattr(void) { struct test_get_xattr *skel = NULL; int fd = -1, err; + int v[32]; fd = open(testfile, O_CREAT | O_RDONLY, 0644); if (!ASSERT_GE(fd, 0, "create_file")) @@ -50,7 +51,13 @@ static void test_xattr(void) if (!ASSERT_GE(fd, 0, "open_file")) goto out; - ASSERT_EQ(skel->bss->found_xattr, 1, "found_xattr"); + ASSERT_EQ(skel->bss->found_xattr_from_file, 1, "found_xattr_from_file"); + + /* Trigger security_inode_getxattr */ + err = getxattr(testfile, "user.kfuncs", v, sizeof(v)); + ASSERT_EQ(err, -1, "getxattr_return"); + ASSERT_EQ(errno, EINVAL, "getxattr_errno"); + ASSERT_EQ(skel->bss->found_xattr_from_dentry, 1, "found_xattr_from_dentry"); out: close(fd); diff --git a/tools/testing/selftests/bpf/progs/test_get_xattr.c b/tools/testing/selftests/bpf/progs/test_get_xattr.c index 7eb2a4e5a3e5..66e737720f7c 100644 --- a/tools/testing/selftests/bpf/progs/test_get_xattr.c +++ b/tools/testing/selftests/bpf/progs/test_get_xattr.c @@ -2,6 +2,7 @@ /* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/ #include "vmlinux.h" +#include <errno.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include "bpf_kfuncs.h" @@ -9,10 +10,12 @@ char _license[] SEC("license") = "GPL"; __u32 monitored_pid; -__u32 found_xattr; +__u32 found_xattr_from_file; +__u32 found_xattr_from_dentry; static const char expected_value[] = "hello"; -char value[32]; +char value1[32]; +char value2[32]; SEC("lsm.s/file_open") int BPF_PROG(test_file_open, struct file *f) @@ -25,13 +28,37 @@ int BPF_PROG(test_file_open, struct file *f) if (pid != monitored_pid) return 0; - bpf_dynptr_from_mem(value, sizeof(value), 0, &value_ptr); + bpf_dynptr_from_mem(value1, sizeof(value1), 0, &value_ptr); ret = bpf_get_file_xattr(f, "user.kfuncs", &value_ptr); if (ret != sizeof(expected_value)) return 0; - if (bpf_strncmp(value, ret, expected_value)) + if (bpf_strncmp(value1, ret, expected_value)) return 0; - found_xattr = 1; + found_xattr_from_file = 1; return 0; } + +SEC("lsm.s/inode_getxattr") +int BPF_PROG(test_inode_getxattr, struct dentry *dentry, char *name) +{ + struct bpf_dynptr value_ptr; + __u32 pid; + int ret; + + pid = bpf_get_current_pid_tgid() >> 32; + if (pid != monitored_pid) + return 0; + + bpf_dynptr_from_mem(value2, sizeof(value2), 0, &value_ptr); + + ret = bpf_get_dentry_xattr(dentry, "user.kfuncs", &value_ptr); + if (ret != sizeof(expected_value)) + return 0; + if (bpf_strncmp(value2, ret, expected_value)) + return 0; + found_xattr_from_dentry = 1; + + /* return non-zero to fail getxattr from user space */ + return -EINVAL; +} -- cgit v1.2.3 From 98f8faea4b6379de2c1483125253b73f7449e6b6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 24 May 2024 09:49:55 +0900 Subject: selftests/uprobes: Add a basic uprobe testcase Add a basic uprobe testcase which checks whether add/remove/trace operations work on /bin/sh. Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../ftrace/test.d/dynevent/add_remove_uprobe.tc | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc new file mode 100644 index 000000000000..a275decdc880 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc @@ -0,0 +1,26 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - add/remove/test uprobe events +# requires: uprobe_events + +echo 0 > events/enable +echo > dynamic_events + +echo 'cat /proc/$$/maps' | /bin/sh | \ + grep "r-xp .*/bin/.*sh$" | \ + awk '{printf "p:myevent %s:0x%s\n", $6,$3 }' >> uprobe_events + +grep -q myevent uprobe_events +test -d events/uprobes/myevent + +echo 1 > events/uprobes/myevent/enable +echo 'ls' | /bin/sh > /dev/null +echo 0 > events/uprobes/myevent/enable +grep -q myevent trace + +echo "-:myevent" >> uprobe_events +! grep -q myevent uprobe_events + +echo > uprobe_events + +clear_trace -- cgit v1.2.3 From 53af1a4b6a55b3910a253fce7a0893e6d51952be Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 23 May 2024 12:45:41 -0400 Subject: tracing/selftests: Run the ownership test twice A regression happened where running the ownership test passes on the first iteration but fails running it a second time. This was caught and fixed, but a later change brought it back.
The regression was missed because the automated tests only run the tests once per boot. Change the ownership test to iterate through the tests twice, as this will catch the regression with a single run. Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- .../ftrace/test.d/00basic/test_ownership.tc | 34 +++++++++++++--------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc index c45094d1e1d2..71e43a92352a 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc @@ -83,32 +83,38 @@ run_tests() { done } -mount -o remount,"$new_options" . +# Run the tests twice as leftovers can cause issues +for loop in 1 2 ; do -run_tests + echo "Running iteration $loop" -mount -o remount,"$mount_options" . + mount -o remount,"$new_options" . -for d in "." "events" "events/sched" "events/sched/sched_switch" "events/sched/sched_switch/enable" $canary; do - test "$d" $original_group -done + run_tests + + mount -o remount,"$mount_options" . + + for d in "." "events" "events/sched" "events/sched/sched_switch" "events/sched/sched_switch/enable" $canary; do + test "$d" $original_group + done # check instances as well -chgrp $other_group instances + chgrp $other_group instances -instance="$(mktemp -u test-XXXXXX)" + instance="$(mktemp -u test-XXXXXX)" -mkdir instances/$instance + mkdir instances/$instance -cd instances/$instance + cd instances/$instance -run_tests + run_tests -cd ../.. + cd ../.. -rmdir instances/$instance + rmdir instances/$instance -chgrp $original_group instances + chgrp $original_group instances +done exit 0 -- cgit v1.2.3 From b2f70c99edc7e4dcd201abbdcc4b75030973edce Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 6 Aug 2024 23:51:36 -0700 Subject: perf hist: Fix reference counting of branch_info iter_finish_branch_entry() doesn't put the branch_info from/to map elements creating memory leaks. This can be seen with: ``` $ perf record -e cycles -b perf test -w noploop $ perf report -D ... 
Direct leak of 984344 byte(s) in 123043 object(s) allocated from: #0 0x7fb2654f3bd7 in malloc libsanitizer/asan/asan_malloc_linux.cpp:69 #1 0x564d3400d10b in map__get util/map.h:186 #2 0x564d3400d10b in ip__resolve_ams util/machine.c:1981 #3 0x564d34014d81 in sample__resolve_bstack util/machine.c:2151 #4 0x564d34094790 in iter_prepare_branch_entry util/hist.c:898 #5 0x564d34098fa4 in hist_entry_iter__add util/hist.c:1238 #6 0x564d33d1f0c7 in process_sample_event tools/perf/builtin-report.c:334 #7 0x564d34031eb7 in perf_session__deliver_event util/session.c:1655 #8 0x564d3403ba52 in do_flush util/ordered-events.c:245 #9 0x564d3403ba52 in __ordered_events__flush util/ordered-events.c:324 #10 0x564d3402d32e in perf_session__process_user_event util/session.c:1708 #11 0x564d34032480 in perf_session__process_event util/session.c:1877 #12 0x564d340336ad in reader__read_event util/session.c:2399 #13 0x564d34033fdc in reader__process_events util/session.c:2448 #14 0x564d34033fdc in __perf_session__process_events util/session.c:2495 #15 0x564d34033fdc in perf_session__process_events util/session.c:2661 #16 0x564d33d27113 in __cmd_report tools/perf/builtin-report.c:1065 #17 0x564d33d27113 in cmd_report tools/perf/builtin-report.c:1805 #18 0x564d33e0ccb7 in run_builtin tools/perf/perf.c:350 #19 0x564d33e0d45e in handle_internal_command tools/perf/perf.c:403 #20 0x564d33cdd827 in run_argv tools/perf/perf.c:447 #21 0x564d33cdd827 in main tools/perf/perf.c:561 ... ``` Clearing up the map_symbols properly creates maps reference count issues so resolve those. Resolving this issue doesn't improve peak heap consumption for the test above. Committer testing: $ sudo dnf install libasan $ make -k CORESIGHT=1 EXTRA_CFLAGS="-fsanitize=address" CC=clang O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Yanteng Si Link: https://lore.kernel.org/r/20240807065136.1039977-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index f8ee1cd6929d..c8c1b511f8a7 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -472,7 +472,9 @@ static int hist_entry__init(struct hist_entry *he, memcpy(he->branch_info, template->branch_info, sizeof(*he->branch_info)); + he->branch_info->from.ms.maps = maps__get(he->branch_info->from.ms.maps); he->branch_info->from.ms.map = map__get(he->branch_info->from.ms.map); + he->branch_info->to.ms.maps = maps__get(he->branch_info->to.ms.maps); he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map); } @@ -970,10 +972,21 @@ out: return err; } +static void branch_info__exit(struct branch_info *bi) +{ + map_symbol__exit(&bi->from.ms); + map_symbol__exit(&bi->to.ms); + zfree_srcline(&bi->srcline_from); + zfree_srcline(&bi->srcline_to); +} + static int iter_finish_branch_entry(struct hist_entry_iter *iter, struct addr_location *al __maybe_unused) { + for (int i = 0; i < iter->total; i++) + branch_info__exit(&iter->bi[i]); + zfree(&iter->bi); iter->he = NULL; @@ -1319,10 +1332,7 @@ void hist_entry__delete(struct hist_entry *he) map_symbol__exit(&he->ms); if (he->branch_info) { - map_symbol__exit(&he->branch_info->from.ms); - 
map_symbol__exit(&he->branch_info->to.ms); - zfree_srcline(&he->branch_info->srcline_from); - zfree_srcline(&he->branch_info->srcline_to); + branch_info__exit(he->branch_info); zfree(&he->branch_info); } -- cgit v1.2.3 From 037f1b67e81c2500c94625b089cfd1ecc76a18b5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 5 Aug 2024 16:46:48 -0700 Subject: perf annotate: Cache debuginfo for data type profiling In find_data_type(), it creates and deletes a debuginfo whenever it tries to find the data type for a sample. This is inefficient, and it most likely accesses the same binary again and again. Let's add a single-entry cache holding the debuginfo structure for the last DSO. Depending on sample data, it usually gives me 2~3x (and sometimes more) speedups. Note that this will introduce a little difference in the output due to the order of checking stack operations. It used to check the stack ops before checking the availability of debug info but I moved it after the symbol check. So it'll report stack operations in DSOs without debug info as unknown. But I think it's ok and better to have the checking near the caching logic. Committer testing: root@x1:~# perf mem record -a sleep 5s root@x1:~# perf evlist cpu_atom/mem-loads,ldlat=30/P cpu_atom/mem-stores/P dummy:u root@x1:~# diff -u before after --- before 2024-08-08 09:33:53.880780784 -0300 +++ after 2024-08-08 09:35:13.917325041 -0300 @@ -81,8 +81,8 @@ # Overhead Data Type # ........ ......... # - 55.43% (unknown) - 11.61% (stack operation) + 55.56% (unknown) + 11.48% (stack operation) 4.93% struct pcpu_hot 3.26% unsigned int 2.48% struct Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240805234648.1453689-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 15 ++------------- tools/perf/util/annotate-data.h | 2 +- tools/perf/util/annotate.c | 37 +++++++++++++++++++++++++++++++++++++ tools/perf/util/annotate.h | 2 ++ tools/perf/util/session.c | 2 ++ 5 files changed, 44 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 734acdd8c4b7..f125ac5f0bda 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1345,16 +1345,9 @@ out: */ struct annotated_data_type *find_data_type(struct data_loc_info *dloc) { - struct annotated_data_type *result = NULL; struct dso *dso = map__dso(dloc->ms->map); Dwarf_Die type_die; - dloc->di = debuginfo__new(dso__long_name(dso)); - if (dloc->di == NULL) { - pr_debug_dtp("cannot get the debug info\n"); - return NULL; - } - /* * The type offset is the same as instruction offset by default. * But when finding a global variable, the offset won't be valid.
@@ -1364,13 +1357,9 @@ struct annotated_data_type *find_data_type(struct data_loc_info *dloc) dloc->fbreg = -1; if (find_data_type_die(dloc, &type_die) < 0) - goto out; - - result = dso__findnew_data_type(dso, &type_die); + return NULL; -out: - debuginfo__delete(dloc->di); - return result; + return dso__findnew_data_type(dso, &type_die); } static int alloc_data_type_histograms(struct annotated_data_type *adt, int nr_entries) diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 992b7ce4bd11..37a1a3b68e0b 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -123,9 +123,9 @@ struct data_loc_info { u64 var_addr; u8 cpumode; struct annotated_op_loc *op; + struct debuginfo *di; /* These are used internally */ - struct debuginfo *di; int fbreg; bool fb_cfa; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index eafe8d65052e..a87d2e97e10d 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -25,6 +25,7 @@ #include "srcline.h" #include "units.h" #include "debug.h" +#include "debuginfo.h" #include "annotate.h" #include "annotate-data.h" #include "evsel.h" @@ -2333,6 +2334,20 @@ u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, return map__rip_2objdump(ms->map, addr); } +static struct debuginfo_cache { + struct dso *dso; + struct debuginfo *dbg; +} di_cache; + +void debuginfo_cache__delete(void) +{ + dso__put(di_cache.dso); + di_cache.dso = NULL; + + debuginfo__delete(di_cache.dbg); + di_cache.dbg = NULL; +} + /** * hist_entry__get_data_type - find data type for given hist entry * @he: hist entry @@ -2367,6 +2382,27 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) return NULL; } + /* + * di_cache holds a pair of values, but code below assumes + * di_cache.dso can be compared/updated and di_cache.dbg can be + * read/updated independently from each other. That assumption only + * holds in single threaded code. 
+ */ + assert(perf_singlethreaded); + + if (map__dso(ms->map) != di_cache.dso) { + dso__put(di_cache.dso); + di_cache.dso = dso__get(map__dso(ms->map)); + + debuginfo__delete(di_cache.dbg); + di_cache.dbg = debuginfo__new(dso__long_name(di_cache.dso)); + } + + if (di_cache.dbg == NULL) { + ann_data_stat.no_dbginfo++; + return NULL; + } + /* Make sure it has the disasm of the function */ if (symbol__annotate(ms, evsel, &arch) < 0) { ann_data_stat.no_insn++; @@ -2411,6 +2447,7 @@ retry: .ip = ms->sym->start + dl->al.offset, .cpumode = he->cpumode, .op = op_loc, + .di = di_cache.dbg, }; if (!op_loc->mem_ref && op_loc->segment == INSN_SEG_NONE) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 64e70d716ff1..0e52558baa01 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -543,4 +543,6 @@ struct annotated_basic_block { int annotate_get_basic_blocks(struct symbol *sym, s64 src, s64 dst, struct list_head *head); +void debuginfo_cache__delete(void); + #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5596bed1b8c8..f9072e003367 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -36,6 +36,7 @@ #include "util.h" #include "arch/common.h" #include "units.h" +#include "annotate.h" #include #ifdef HAVE_ZSTD_SUPPORT @@ -304,6 +305,7 @@ void perf_session__delete(struct perf_session *session) return; auxtrace__free(session); auxtrace_index__free(&session->auxtrace_index); + debuginfo_cache__delete(); perf_session__destroy_kernel_maps(session); perf_decomp__release_events(session->decomp_data.decomp); perf_env__exit(&session->header.env); -- cgit v1.2.3 From 90d78e7b8e57b064f700545081fc9288c1564e92 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 7 Aug 2024 15:31:29 -0700 Subject: perf annotate-data: Show typedef names properly die_get_typename() would resolve typedefs and get to the original type. But sometimes the original type is a struct without a name, which makes the output confusing and hard to read. This is a diff of perf report -s type before and after the change. New types such as atomic{,64}_t and sigset_t appeared and the portion of unnamed struct was reduced. Also u32, u64 and size_t were split from the base types. --- b 2024-08-01 17:02:34.307809952 -0700 +++ a 2024-08-07 14:17:05.245853999 -0700 - 2.40% long unsigned int + 2.26% long unsigned int - 1.56% unsigned int + 1.27% unsigned int - 0.98% struct - 0.79% long long unsigned int + 0.58% long long unsigned int + 0.36% struct + 0.27% atomic64_t + 0.22% u32 + 0.21% u64 + 0.19% atomic_t + 0.13% size_t - 0.08% struct seqcount_spinlock + 0.08% seqcount_spinlock_t + 0.08% sigset_t + 0.08% __poll_t Let's use the typedef name directly, and the resolved type to get the size of the type. Committer testing: root@x1:~# diff -u before after | head -30 --- before 2024-08-08 09:35:13.917325041 -0300 +++ after 2024-08-08 09:37:35.312257905 -0300 @@ -10,25 +10,27 @@ # ........ .........
# 79.40% (unknown) - 2.28% union 1.96% (stack operation) - 1.24% struct + 1.87% pthread_mutex_t 0.99% u32[] - 0.92% unsigned int 0.77% struct task_struct + 0.75% U32 0.75% struct pcpu_hot 0.63% struct qspinlock + 0.61% atomic_t 0.59% struct list_head - 0.58% int 0.53% struct cfs_rq 0.51% BYTE* - 0.48% unsigned char + 0.48% BYTE 0.48% long unsigned int 0.46% struct rq 0.41% struct worker 0.41% struct memcg_vmstats_percpu + 0.41% pthread_cond_t 0.37% _Bool + 0.36% int root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240807223129.1738004-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 39 ++++++++++++++++++++++++++++++--------- tools/perf/util/dwarf-aux.c | 2 +- tools/perf/util/dwarf-aux.h | 2 ++ 3 files changed, 33 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index f125ac5f0bda..c18272092a6b 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -217,8 +217,13 @@ static int __add_member_cb(Dwarf_Die *die, void *arg) strbuf_init(&sb, 32); die_get_typename(die, &sb); - die_get_real_type(die, &member_type); - if (dwarf_aggregate_size(&member_type, &size) < 0) + __die_get_real_type(die, &member_type); + if (dwarf_tag(&member_type) == DW_TAG_typedef) + die_get_real_type(&member_type, &die_mem); + else + die_mem = member_type; + + if (dwarf_aggregate_size(&die_mem, &size) < 0) size = 0; if (!dwarf_attr_integrate(die, DW_AT_data_member_location, &attr)) @@ -235,11 +240,11 @@ static int __add_member_cb(Dwarf_Die *die, void *arg) INIT_LIST_HEAD(&member->children); list_add_tail(&member->node, &parent->children); - tag = dwarf_tag(&member_type); + tag = dwarf_tag(&die_mem); switch (tag) { case DW_TAG_structure_type: case DW_TAG_union_type: - die_find_child(&member_type, __add_member_cb, member, &die_mem); + die_find_child(&die_mem, __add_member_cb, member, &die_mem); break; default: break; @@ -281,6 +286,10 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, if (die_get_typename_from_type(type_die, &sb) < 0) strbuf_add(&sb, "(unknown type)", 14); type_name = strbuf_detach(&sb, NULL); + + if (dwarf_tag(type_die) == DW_TAG_typedef) + die_get_real_type(type_die, type_die); + dwarf_aggregate_size(type_die, &size); /* Check existing nodes in dso->data_types tree */ @@ -342,6 +351,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, { Dwarf_Word size; bool is_pointer = true; + Dwarf_Die sized_type; if (reg == DWARF_REG_PC) is_pointer = false; @@ -351,7 +361,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, is_pointer = false; /* Get the type of the variable */ - if (die_get_real_type(var_die, type_die) == NULL) { + if (__die_get_real_type(var_die, type_die) == NULL) { pr_debug_dtp("variable has no type\n"); ann_data_stat.no_typeinfo++; return -1; @@ -365,15 +375,20 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, if (is_pointer) { if ((dwarf_tag(type_die) != DW_TAG_pointer_type && dwarf_tag(type_die) != DW_TAG_array_type) || - die_get_real_type(type_die, type_die) == NULL) { + __die_get_real_type(type_die, type_die) == NULL) { pr_debug_dtp("no pointer or no type\n"); ann_data_stat.no_typeinfo++; return -1; } } + if (dwarf_tag(type_die) == 
DW_TAG_typedef) + die_get_real_type(type_die, &sized_type); + else + sized_type = *type_die; + /* Get the size of the actual type */ - if (dwarf_aggregate_size(type_die, &size) < 0) { + if (dwarf_aggregate_size(&sized_type, &size) < 0) { pr_debug_dtp("type size is unknown\n"); ann_data_stat.invalid_size++; return -1; @@ -846,6 +861,7 @@ static int check_matching_type(struct type_state *state, if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { int tag = dwarf_tag(&state->regs[reg].type); + Dwarf_Die sized_type; /* * Normal registers should hold a pointer (or array) to @@ -862,13 +878,18 @@ static int check_matching_type(struct type_state *state, pr_debug_dtp("\n"); /* Remove the pointer and get the target type */ - if (die_get_real_type(&state->regs[reg].type, type_die) == NULL) + if (__die_get_real_type(&state->regs[reg].type, type_die) == NULL) return -1; dloc->type_offset = dloc->op->offset; + if (dwarf_tag(type_die) == DW_TAG_typedef) + die_get_real_type(type_die, &sized_type); + else + sized_type = *type_die; + /* Get the size of the actual type */ - if (dwarf_aggregate_size(type_die, &size) < 0 || + if (dwarf_aggregate_size(&sized_type, &size) < 0 || (unsigned)dloc->type_offset >= size) return -1; diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 44ef968a7ad3..5e080d7e22c2 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -267,7 +267,7 @@ Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) } /* Get a type die, but skip qualifiers */ -static Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) +Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) { int tag; diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 24446412b869..336a3a183a78 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -56,6 +56,8 @@ const char *die_get_decl_file(Dwarf_Die *dw_die); /* Get type die */ Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem); +/* Get a type die, but skip qualifiers */ +Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem); /* Get a type die, but skip qualifiers and typedef */ Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem); -- cgit v1.2.3 From ed5bb548cc648de6dbb5894cf95955998e466589 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 7 Aug 2024 21:49:54 -0700 Subject: perf test: Add a new shell test for perf ftrace $ sudo ./perf test ftrace -vv 86: perf ftrace tests: --- start --- test child forked, pid 1772223 perf ftrace list test syscalls for sleep: __x64_sys_nanosleep __ia32_sys_nanosleep __x64_sys_clock_nanosleep __ia32_sys_clock_nanosleep perf ftrace list test [Success] perf ftrace trace test # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 0) | __x64_sys_clock_nanosleep() { 0) | common_nsleep() { 0) | hrtimer_nanosleep() { 0) | do_nanosleep() { perf ftrace trace test [Success] perf ftrace latency test target function: __x64_sys_clock_nanosleep # DURATION | COUNT | GRAPH | 32 - 64 ms | 1 | ############################################## | perf ftrace latency test [Success] perf ftrace profile test # Total (us) Avg (us) Max (us) Count Function 100136.400 100136.400 100136.400 1 __x64_sys_clock_nanosleep 100135.200 100135.200 100135.200 1 common_nsleep 100134.700 100134.700 100134.700 1 hrtimer_nanosleep 100133.700 100133.700 100133.700 1 do_nanosleep 100130.600 100130.600 100130.600 1 schedule 166.868 55.623 80.299 3 scheduler_tick 5.926 5.926 
5.926 1 native_smp_send_reschedule 301.941 301.941 301.941 1 __x64_sys_execve 295.786 295.786 295.786 1 do_execveat_common.isra.0 71.397 35.699 46.403 2 bprm_execve 2.519 1.260 1.547 2 sched_mm_cid_before_execve 1.098 0.549 0.686 2 sched_mm_cid_after_execve perf ftrace profile test [Success] ---- end(0) ---- 86: perf ftrace tests : Ok Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20240808044954.1775333-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/ftrace.sh | 84 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100755 tools/perf/tests/shell/ftrace.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/ftrace.sh b/tools/perf/tests/shell/ftrace.sh new file mode 100755 index 000000000000..b1c36d30559a --- /dev/null +++ b/tools/perf/tests/shell/ftrace.sh @@ -0,0 +1,84 @@ +#!/bin/sh +# perf ftrace tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +# perf ftrace commands only works for root +if [ "$(id -u)" != 0 ]; then + echo "perf ftrace test [Skipped: no permission]" + exit 2 +fi + +output=$(mktemp /tmp/__perf_test.ftrace.XXXXXX) + +cleanup() { + rm -f "${output}" + + trap - EXIT TERM INT +} + +trap_cleanup() { + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + +# this will be set in test_ftrace_trace() +target_function= + +test_ftrace_list() { + echo "perf ftrace list test" + perf ftrace -F > "${output}" + # this will be used in test_ftrace_trace() + sleep_functions=$(grep 'sys_.*sleep$' "${output}") + echo "syscalls for sleep:" + echo "${sleep_functions}" + echo "perf ftrace list test [Success]" +} + +test_ftrace_trace() { + echo "perf ftrace trace test" + perf ftrace trace --graph-opts depth=5 sleep 0.1 > "${output}" + # it should have some function name contains 'sleep' + grep "^#" "${output}" + grep -F 'sleep()' "${output}" + # find actual syscall function name + for FN in ${sleep_functions}; do + if grep -q "${FN}" "${output}"; then + target_function="${FN}" + echo "perf ftrace trace test [Success]" + return + fi + done + + echo "perf ftrace trace test [Failure: sleep syscall not found]" + exit 1 +} + +test_ftrace_latency() { + echo "perf ftrace latency test" + echo "target function: ${target_function}" + perf ftrace latency -T "${target_function}" sleep 0.1 > "${output}" + grep "^#" "${output}" + grep "###" "${output}" + echo "perf ftrace latency test [Success]" +} + +test_ftrace_profile() { + echo "perf ftrace profile test" + perf ftrace profile sleep 0.1 > "${output}" + grep ^# "${output}" + grep sleep "${output}" + grep schedule "${output}" + grep execve "${output}" + echo "perf ftrace profile test [Success]" +} + +test_ftrace_list +test_ftrace_trace +test_ftrace_latency +test_ftrace_profile + +cleanup
exit 0 -- cgit v1.2.3 From 2df5484bbf2f1fa3efa2b6385efc1350d89b7783 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 8 Aug 2024 09:59:40 -0300 Subject: perf tests ftrace: Add pattern check for time, count In 'perf ftrace profile sleep 0.1' we know that we'll have a specific kernel function that will take a bit more than 0.1 seconds and will run exactly once, so we can add a check for that so that we validate more than just the presence of some functions in the profile.
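To make the new check concrete, here is a minimal, self-contained C sketch (illustrative only, not part of the patch; the sample line is made up) that exercises the same POSIX extended regular expression the test builds from its time_re variable:

```c
/* Hedged illustration: match three duration columns of a bit more than
 * 0.1s (10xxxx.xxx), a Count of exactly 1, then *clock_nanosleep - the
 * same shape as the grep -E pattern added to ftrace.sh below.
 */
#include <regex.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical 'perf ftrace profile' output line */
	const char *line =
		" 100283.000 100283.000 100283.000          1   __x64_sys_clock_nanosleep";
	const char *time_re = "[[:space:]]+10[[:digit:]]{4}\\.[[:digit:]]{3}";
	char pattern[256];
	regex_t re;
	int rc;

	snprintf(pattern, sizeof(pattern),
		 "^%s%s%s[[:space:]]+1[[:space:]]+.*clock_nanosleep",
		 time_re, time_re, time_re);
	if (regcomp(&re, pattern, REG_EXTENDED))
		return 1;
	rc = regexec(&re, line, 0, NULL, 0);
	printf("%s\n", rc == 0 ? "match" : "no match");
	regfree(&re);
	return rc;
}
```

A count other than 1, or a total time outside the 100000.000-109999.999 range, makes the match fail, which is what lets the test assert "a bit more than 0.1 seconds, exactly once" rather than just the presence of the function name.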
Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/ZrTBo7KACZeuCyLj@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/ftrace.sh | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/ftrace.sh b/tools/perf/tests/shell/ftrace.sh index b1c36d30559a..a6ee740f0d7e 100755 --- a/tools/perf/tests/shell/ftrace.sh +++ b/tools/perf/tests/shell/ftrace.sh @@ -72,6 +72,11 @@ test_ftrace_profile() { grep sleep "${output}" grep schedule "${output}" grep execve "${output}" + time_re="[[:space:]]+10[[:digit:]]{4}\.[[:digit:]]{3}" + # 100283.000 100283.000 100283.000 1 __x64_sys_clock_nanosleep + # Check for one *clock_nanosleep line with a Count of just 1 that takes a bit more than 0.1 seconds + # Strip the _x64_sys part to work with other architectures + grep -E "^${time_re}${time_re}${time_re}[[:space:]]+1[[:space:]]+.*clock_nanosleep" "${output}" echo "perf ftrace profile test [Success]" } -- cgit v1.2.3 From 37e2a19c98bf99747ca997be876dfc13f9165e0a Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Thu, 8 Aug 2024 12:37:49 +0200 Subject: perf test pmu: Set uninitialized PMU alias to null Commit 3e0bf9fde2984469 ("perf pmu: Restore full PMU name wildcard support") adds a test case "PMU cmdline match" that covers PMU name wildcard support provided by function perf_pmu__match(). The test works with a wide range of supported combinations of PMU name matching but omits the case that if the perf_pmu__match() cannot match the PMU name to the wildcard, it tries to match its alias. However, this variable is not set up, causing the test case to fail when run with subprocesses or to segfault if run as a single process. ./perf test -vv 9 9: Sysfs PMU tests : 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok 9.6: PMU cmdline match : FAILED! ./perf test -F 9 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok Segmentation fault (core dumped) Initialize the PMU alias to null for all tests of perf_pmu__match() as this functionality is not being tested and the alias matching works exactly the same as the matching of the PMU name. ./perf test -F 9 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok 9.6: PMU cmdline match : Ok Fixes: 3e0bf9fde2984469 ("perf pmu: Restore full PMU name wildcard support") Signed-off-by: Veronika Molnarova Cc: James Clark Cc: Michael Petlan Cc: Namhyung Kim Cc: Radostin Stoyanov Link: https://lore.kernel.org/r/20240808103749.9356-1-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 0b2f04a55d7b..a4730b5dc0d9 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -453,11 +453,13 @@ static int test__name_cmp(struct test_suite *test __maybe_unused, int subtest __ /** * Test perf_pmu__match() that's used to search for a PMU given a name passed * on the command line. The name that's passed may also be a filename type glob - * match. + * match. 
If the name does not match, perf_pmu__match() attempts to match the + alias of the PMU, if provided. */ static int test__pmu_match(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { struct perf_pmu test_pmu; + test_pmu.alias_name = NULL; test_pmu.name = "pmuname"; TEST_ASSERT_EQUAL("Exact match", perf_pmu__match(&test_pmu, "pmuname"), true); -- cgit v1.2.3 From 599c19397b17d197fc1184bbc950f163a292efc9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 7 Aug 2024 22:46:43 -0700 Subject: perf callchain: Fix stitch LBR memory leaks The 'struct callchain_cursor_node' has a 'struct map_symbol' whose maps and map members are reference counted. Ensure these values use a _get routine to increment the reference counts and use map_symbol__exit() to release the reference counts. Do the same for 'struct thread's prev_lbr_cursor, but save the size of the prev_lbr_cursor array so that it may be iterated. Ensure that when stitch_nodes are placed on the free list the map_symbols are exited. Fix resolve_lbr_callchain_sample() by replacing list_replace_init() with list_splice_init(), so the whole list is moved and nodes aren't leaked. A reproduction of the memory leaks is possible with a leak sanitizer build in the perf report command of: ``` $ perf record -e cycles --call-graph lbr perf test -w thloop $ perf report --stitch-lbr ``` Reviewed-by: Kan Liang Fixes: ff165628d72644e3 ("perf callchain: Stitch LBR call stack") Signed-off-by: Ian Rogers [ Basic tests after applying the patch, repeating the example above ] Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240808054644.1286065-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 17 +++++++++++++++-- tools/perf/util/thread.c | 4 ++++ tools/perf/util/thread.h | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 8477edefc299..706be5e4a076 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2270,8 +2270,12 @@ static void save_lbr_cursor_node(struct thread *thread, cursor->curr = cursor->first; else cursor->curr = cursor->curr->next; + + map_symbol__exit(&lbr_stitch->prev_lbr_cursor[idx].ms); memcpy(&lbr_stitch->prev_lbr_cursor[idx], cursor->curr, sizeof(struct callchain_cursor_node)); + lbr_stitch->prev_lbr_cursor[idx].ms.maps = maps__get(cursor->curr->ms.maps); + lbr_stitch->prev_lbr_cursor[idx].ms.map = map__get(cursor->curr->ms.map); lbr_stitch->prev_lbr_cursor[idx].valid = true; cursor->pos++; @@ -2482,6 +2486,9 @@ static bool has_stitched_lbr(struct thread *thread, memcpy(&stitch_node->cursor, &lbr_stitch->prev_lbr_cursor[i], sizeof(struct callchain_cursor_node)); + stitch_node->cursor.ms.maps = maps__get(lbr_stitch->prev_lbr_cursor[i].ms.maps); + stitch_node->cursor.ms.map = map__get(lbr_stitch->prev_lbr_cursor[i].ms.map); + if (callee) list_add(&stitch_node->node, &lbr_stitch->lists); else @@ -2505,6 +2512,8 @@ static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr) if (!thread__lbr_stitch(thread)->prev_lbr_cursor) goto free_lbr_stitch; + thread__lbr_stitch(thread)->prev_lbr_cursor_size = max_lbr + 1; + INIT_LIST_HEAD(&thread__lbr_stitch(thread)->lists); INIT_LIST_HEAD(&thread__lbr_stitch(thread)->free_lists); @@ -2560,8 +2569,12 @@ static int 
resolve_lbr_callchain_sample(struct thread *thread, max_lbr, callee); if (!stitched_lbr && !list_empty(&lbr_stitch->lists)) { - list_replace_init(&lbr_stitch->lists, - &lbr_stitch->free_lists); + struct stitch_list *stitch_node; + + list_for_each_entry(stitch_node, &lbr_stitch->lists, node) + map_symbol__exit(&stitch_node->cursor.ms); + + list_splice_init(&lbr_stitch->lists, &lbr_stitch->free_lists); } memcpy(&lbr_stitch->prev_sample, sample, sizeof(*sample)); } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 87c59aa9fe38..0ffdd52d86d7 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -476,6 +476,7 @@ void thread__free_stitch_list(struct thread *thread) return; list_for_each_entry_safe(pos, tmp, &lbr_stitch->lists, node) { + map_symbol__exit(&pos->cursor.ms); list_del_init(&pos->node); free(pos); } @@ -485,6 +486,9 @@ void thread__free_stitch_list(struct thread *thread) free(pos); } + for (unsigned int i = 0 ; i < lbr_stitch->prev_lbr_cursor_size; i++) + map_symbol__exit(&lbr_stitch->prev_lbr_cursor[i].ms); + zfree(&lbr_stitch->prev_lbr_cursor); free(thread__lbr_stitch(thread)); thread__set_lbr_stitch(thread, NULL); diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 8b4a3c69bad1..6cbf6eb2812e 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -26,6 +26,7 @@ struct lbr_stitch { struct list_head free_lists; struct perf_sample prev_sample; struct callchain_cursor_node *prev_lbr_cursor; + unsigned int prev_lbr_cursor_size; }; DECLARE_RC_STRUCT(thread) { -- cgit v1.2.3 From 32559b99e0f590700dcc2522d97bc637979bcf61 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 7 Aug 2024 22:46:44 -0700 Subject: perf test: Add set of perf record LBR tests Adds coverage for LBR operations and LBR callgraph. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240808054644.1286065-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_lbr.sh | 161 +++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100755 tools/perf/tests/shell/record_lbr.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/record_lbr.sh b/tools/perf/tests/shell/record_lbr.sh new file mode 100755 index 000000000000..baf168d0ddbb --- /dev/null +++ b/tools/perf/tests/shell/record_lbr.sh @@ -0,0 +1,161 @@ +#!/bin/bash +# perf record LBR tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +if [ ! -f /sys/devices/cpu/caps/branches ] +then + echo "Skip: only x86 CPUs support LBR" + exit 2 +fi + +err=0 +perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) + +cleanup() { + rm -rf "${perfdata}" + rm -rf "${perfdata}".old + rm -rf "${perfdata}".txt + + trap - EXIT TERM INT +} + +trap_cleanup() { + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + + +lbr_callgraph_test() { + test="LBR callgraph" + + echo "$test" + if ! perf record -e cycles --call-graph lbr -o "${perfdata}" perf test -w thloop + then + echo "$test [Failed support missing]" + if [ $err -eq 0 ] + then + err=2 + fi + return + fi + + if ! 
perf report --stitch-lbr -i "${perfdata}" > "${perfdata}".txt + then + cat "${perfdata}".txt + echo "$test [Failed in perf report]" + err=1 + return + fi + + echo "$test [Success]" +} + +lbr_test() { + local branch_flags=$1 + local test="LBR $2 test" + local threshold=$3 + local out + local sam_nr + local bs_nr + local zero_nr + local r + + echo "$test" + if ! perf record -e cycles $branch_flags -o "${perfdata}" perf test -w thloop + then + echo "$test [Failed support missing]" + perf record -e cycles $branch_flags -o "${perfdata}" perf test -w thloop || true + if [ $err -eq 0 ] + then + err=2 + fi + return + fi + + out=$(perf report -D -i "${perfdata}" 2> /dev/null | grep -A1 'PERF_RECORD_SAMPLE') + sam_nr=$(echo "$out" | grep -c 'PERF_RECORD_SAMPLE' || true) + if [ $sam_nr -eq 0 ] + then + echo "$test [Failed no samples captured]" + err=1 + return + fi + echo "$test: $sam_nr samples" + + bs_nr=$(echo "$out" | grep -c 'branch stack: nr:' || true) + if [ $sam_nr -ne $bs_nr ] + then + echo "$test [Failed samples missing branch stacks]" + err=1 + return + fi + + zero_nr=$(echo "$out" | grep -c 'branch stack: nr:0' || true) + r=$(($zero_nr * 100 / $bs_nr)) + if [ $r -gt $threshold ]; then + echo "$test [Failed empty br stack ratio exceed $threshold%: $r%]" + err=1 + return + fi + + echo "$test [Success]" +} + +parallel_lbr_test() { + err=0 + perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) + lbr_test "$1" "$2" "$3" + cleanup + exit $err +} + +lbr_callgraph_test + +# Sequential +lbr_test "-b" "any branch" 2 +lbr_test "-j any_call" "any call" 2 +lbr_test "-j any_ret" "any ret" 2 +lbr_test "-j ind_call" "any indirect call" 2 +lbr_test "-j ind_jmp" "any indirect jump" 100 +lbr_test "-j call" "direct calls" 2 +lbr_test "-j ind_call,u" "any indirect user call" 100 +lbr_test "-a -b" "system wide any branch" 2 +lbr_test "-a -j any_call" "system wide any call" 2 + +# Parallel +parallel_lbr_test "-b" "parallel any branch" 100 & +pid1=$! +parallel_lbr_test "-j any_call" "parallel any call" 100 & +pid2=$! +parallel_lbr_test "-j any_ret" "parallel any ret" 100 & +pid3=$! +parallel_lbr_test "-j ind_call" "parallel any indirect call" 100 & +pid4=$! +parallel_lbr_test "-j ind_jmp" "parallel any indirect jump" 100 & +pid5=$! +parallel_lbr_test "-j call" "parallel direct calls" 100 & +pid6=$! +parallel_lbr_test "-j ind_call,u" "parallel any indirect user call" 100 & +pid7=$! +parallel_lbr_test "-a -b" "parallel system wide any branch" 100 & +pid8=$! +parallel_lbr_test "-a -j any_call" "parallel system wide any call" 100 & +pid9=$! + +for pid in $pid1 $pid2 $pid3 $pid4 $pid5 $pid6 $pid7 $pid8 $pid9 +do + set +e + wait $pid + child_err=$? + set -e + if ([ $err -eq 2 ] && [ $child_err -eq 1 ]) || [ $err -eq 0 ] + then + err=$child_err + fi +done + +cleanup +exit $err -- cgit v1.2.3 From 9e9d0a79d34716f8e2590ccab0eee08f4fdfc7f4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 8 Aug 2024 11:26:13 -0300 Subject: perf test shell lbr: Support hybrid x86 systems too Running on a: root@x1:~# grep 'model name' -m1 /proc/cpuinfo model name : 13th Gen Intel(R) Core(TM) i7-1365U root@x1:~# It skips all the tests with: root@x1:~# perf test -vvvv LBR 97: perf record LBR tests: --- start --- test child forked, pid 2033388 Skip: only x86 CPUs support LBR ---- end(-2) ---- 97: perf record LBR tests : Skip root@x1:~# Because the test checks for the /sys/devices/cpu/caps/branches file, that isn't present as we have instead: root@x1:~# ls -la /sys/devices/cpu*/caps/branches -r--r--r--. 
1 root root 4096 Aug 8 11:22 /sys/devices/cpu_atom/caps/branches -r--r--r--. 1 root root 4096 Aug 8 11:21 /sys/devices/cpu_core/caps/branches root@x1:~# If we check as well for one of those, /sys/devices/cpu_core/caps/branches, then we don't skip the tests and all are run on these x86 Intel Hybrid systems as well, passing all of them: root@x1:~# perf test -vvvv LBR 97: perf record LBR tests: --- start --- test child forked, pid 2034956 LBR callgraph [ perf record: Woken up 5 times to write data ] [ perf record: Captured and wrote 1.812 MB /tmp/__perf_test.perf.data.B2HvQ (8114 samples) ] LBR callgraph [Success] LBR any branch test [ perf record: Woken up 25 times to write data ] [ perf record: Captured and wrote 6.382 MB /tmp/__perf_test.perf.data.B2HvQ (8071 samples) ] LBR any branch test: 8071 samples LBR any branch test [Success] LBR any call test [ perf record: Woken up 23 times to write data ] [ perf record: Captured and wrote 6.208 MB /tmp/__perf_test.perf.data.B2HvQ (8092 samples) ] LBR any call test: 8092 samples LBR any call test [Success] LBR any ret test [ perf record: Woken up 24 times to write data ] [ perf record: Captured and wrote 6.396 MB /tmp/__perf_test.perf.data.B2HvQ (8093 samples) ] LBR any ret test: 8093 samples LBR any ret test [Success] LBR any indirect call test [ perf record: Woken up 25 times to write data ] [ perf record: Captured and wrote 6.344 MB /tmp/__perf_test.perf.data.B2HvQ (8067 samples) ] LBR any indirect call test: 8067 samples LBR any indirect call test [Success] LBR any indirect jump test [ perf record: Woken up 12 times to write data ] [ perf record: Captured and wrote 3.073 MB /tmp/__perf_test.perf.data.B2HvQ (8061 samples) ] LBR any indirect jump test: 8061 samples LBR any indirect jump test [Success] LBR direct calls test [ perf record: Woken up 25 times to write data ] [ perf record: Captured and wrote 6.380 MB /tmp/__perf_test.perf.data.B2HvQ (8076 samples) ] LBR direct calls test: 8076 samples LBR direct calls test [Success] LBR any indirect user call test [ perf record: Woken up 5 times to write data ] [ perf record: Captured and wrote 1.597 MB /tmp/__perf_test.perf.data.B2HvQ (8079 samples) ] LBR any indirect user call test: 8079 samples LBR any indirect user call test [Success] LBR system wide any branch test [ perf record: Woken up 26 times to write data ] [ perf record: Captured and wrote 9.088 MB /tmp/__perf_test.perf.data.B2HvQ (9209 samples) ] LBR system wide any branch test: 9209 samples LBR system wide any branch test [Success] LBR system wide any call test [ perf record: Woken up 25 times to write data ] [ perf record: Captured and wrote 8.945 MB /tmp/__perf_test.perf.data.B2HvQ (9333 samples) ] LBR system wide any call test: 9333 samples LBR system wide any call test [Success] LBR parallel any branch test LBR parallel any call test LBR parallel any ret test LBR parallel any indirect call test LBR parallel any indirect jump test LBR parallel direct calls test LBR parallel system wide any branch test LBR parallel any indirect user call test LBR parallel system wide any call test [ perf record: Woken up 9 times to write data ] [ perf record: Woken up 51 times to write data ] [ perf record: Woken up 1 times to write data ] [ perf record: Woken up 5 times to write data ] [ perf record: Woken up 559 times to write data ] [ perf record: Woken up 14 times to write data ] [ perf record: Woken up 17 times to write data ] [ perf record: Woken up 1 times to write data ] [ perf record: Woken up 11 times to write data ] [ perf record: Captured 
and wrote 0.150 MB /tmp/__perf_test.perf.data.lANpR (1909 samples) ] [ perf record: Captured and wrote 2.371 MB /tmp/__perf_test.perf.data.Olum8 (3033 samples) ] [ perf record: Captured and wrote 1.230 MB /tmp/__perf_test.perf.data.njfJ8 (1742 samples) ] [ perf record: Captured and wrote 5.554 MB /tmp/__perf_test.perf.data.4ZTrj (29662 samples) ] [ perf record: Captured and wrote 19.906 MB /tmp/__perf_test.perf.data.dlGQt (29576 samples) ] [ perf record: Captured and wrote 0.289 MB /tmp/__perf_test.perf.data.CAT7y (4311 samples) ] [ perf record: Captured and wrote 3.129 MB /tmp/__perf_test.perf.data.diuKG (3971 samples) ] LBR parallel any indirect user call test: 1909 samples [ perf record: Captured and wrote 4.858 MB /tmp/__perf_test.perf.data.sVjtN (6130 samples) ] LBR parallel any indirect user call test [Success] [ perf record: Captured and wrote 3.669 MB /tmp/__perf_test.perf.data.AJtNI (4827 samples) ] LBR parallel any indirect jump test: 4311 samples LBR parallel any indirect jump test [Success] LBR parallel direct calls test: 3033 samples LBR parallel direct calls test [Success] LBR parallel any indirect call test: 1742 samples LBR parallel any indirect call test [Success] LBR parallel any call test: 4827 samples LBR parallel any call test [Success] LBR parallel any branch test: 6130 samples LBR parallel any branch test [Success] LBR parallel system wide any branch test: 29662 samples LBR parallel any ret test: 3971 samples LBR parallel any ret test [Success] LBR parallel system wide any branch test [Success] LBR parallel system wide any call test: 29576 samples LBR parallel system wide any call test [Success] ---- end(0) ---- 97: perf record LBR tests : Ok root@x1:~# Reviewed-by: Kan Liang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZrTXftup0H46R8WK@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_lbr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record_lbr.sh b/tools/perf/tests/shell/record_lbr.sh index baf168d0ddbb..32314641217e 100755 --- a/tools/perf/tests/shell/record_lbr.sh +++ b/tools/perf/tests/shell/record_lbr.sh @@ -4,7 +4,7 @@ set -e -if [ ! -f /sys/devices/cpu/caps/branches ] +if [ ! -f /sys/devices/cpu/caps/branches ] && [ ! -f /sys/devices/cpu_core/caps/branches ] then echo "Skip: only x86 CPUs support LBR" exit 2 -- cgit v1.2.3 From 3882dccf48f9fbe787b5df3187d708ef348ac860 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 8 Aug 2024 16:05:57 +0100 Subject: bpf/bpf_get,set_sockopt: add option to set TCP-BPF sock ops flags Currently the only opportunity to set sock ops flags dictating which callbacks fire for a socket is from within a TCP-BPF sockops program. This is problematic if the connection is already set up as there is no further chance to specify callbacks for that socket. Add TCP_BPF_SOCK_OPS_CB_FLAGS to bpf_setsockopt() and bpf_getsockopt() to allow users to specify callbacks later, either via an iterator over sockets or via a socket-specific program triggered by a setsockopt() on the socket. Previous discussion on this here [1]. 
[1] https://lore.kernel.org/bpf/f42f157b-6e52-dd4d-3d97-9b86c84c0b00@oracle.com/ Signed-off-by: Alan Maguire Link: https://lore.kernel.org/r/20240808150558.1035626-2-alan.maguire@oracle.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 3 ++- 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 35bcf52dbc65..e05b39e39c3f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2851,7 +2851,7 @@ union bpf_attr { * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, - * **TCP_BPF_RTO_MIN**. + * **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports the following *optname*\ s: * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. @@ -7080,6 +7080,7 @@ enum { TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ + TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index f3c72cf86099..d96a50f3f016 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5278,6 +5278,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) + return -EINVAL; + tp->bpf_sock_ops_cb_flags = val; + break; default: return -EINVAL; } @@ -5366,6 +5371,17 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, if (*optlen < 1) return -EINVAL; break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (*optlen != sizeof(int)) + return -EINVAL; + if (getopt) { + struct tcp_sock *tp = tcp_sk(sk); + int cb_flags = tp->bpf_sock_ops_cb_flags; + + memcpy(optval, &cb_flags, *optlen); + return 0; + } + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); default: if (getopt) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..e05b39e39c3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2851,7 +2851,7 @@ union bpf_attr { * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, - * **TCP_BPF_RTO_MIN**. + * **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports the following *optname*\ s: * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. @@ -7080,6 +7080,7 @@ enum { TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ + TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ }; enum { -- cgit v1.2.3 From d53050934e66dbee64caed1309cef963a416c52f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 8 Aug 2024 16:05:58 +0100 Subject: selftests/bpf: add sockopt tests for TCP_BPF_SOCK_OPS_CB_FLAGS Add tests to set TCP sockopt TCP_BPF_SOCK_OPS_CB_FLAGS via bpf_setsockopt() and use a cgroup/getsockopt program to retrieve the value to verify it was set. 
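For orientation before the diffs, here is a stripped-down sockops sketch of the set/get flow being tested (a hedged illustration, not the actual selftest; it assumes vmlinux.h carries the TCP_BPF_* and BPF_SOCK_OPS_* UAPI enums, while SOL_TCP is defined by hand because it is a macro rather than BTF-visible):

```c
// SPDX-License-Identifier: GPL-2.0
/* Minimal sketch: enable TCP state-change callbacks on a passively
 * established socket through the new TCP_BPF_SOCK_OPS_CB_FLAGS option.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define SOL_TCP 6	/* mirrors the UAPI value; not emitted into vmlinux.h */

char _license[] SEC("license") = "GPL";

SEC("sockops")
int enable_state_cb(struct bpf_sock_ops *skops)
{
	int flags;

	if (skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
		return 1;

	/* read back the currently enabled callback flags... */
	if (bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS,
			   &flags, sizeof(flags)))
		return 1;

	/* ...and turn on state-change notifications on top of them */
	flags |= BPF_SOCK_OPS_STATE_CB_FLAG;
	bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS,
		       &flags, sizeof(flags));
	return 1;
}
```

The selftest below does the equivalent from its skops_sockopt() program and then verifies the value from user space via a cgroup/getsockopt program that reads tp->bpf_sock_ops_cb_flags.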
Signed-off-by: Alan Maguire Link: https://lore.kernel.org/r/20240808150558.1035626-3-alan.maguire@oracle.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/setget_sockopt.c | 47 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/setget_sockopt.c | 26 ++++++++++-- 2 files changed, 70 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c index 7d4a9b3d3722..e12255121c15 100644 --- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -154,6 +154,51 @@ err_out: close(sfd); } +static void test_nonstandard_opt(int family) +{ + struct setget_sockopt__bss *bss = skel->bss; + struct bpf_link *getsockopt_link = NULL; + int sfd = -1, fd = -1, cfd = -1, flags; + socklen_t flagslen = sizeof(flags); + + memset(bss, 0, sizeof(*bss)); + + sfd = start_server(family, SOCK_STREAM, + family == AF_INET6 ? addr6_str : addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + return; + + fd = connect_to_fd(sfd, 0); + if (!ASSERT_GE(fd, 0, "connect_to_fd_server")) + goto err_out; + + /* cgroup/getsockopt prog will intercept getsockopt() below and + * retrieve the tcp socket bpf_sock_ops_cb_flags value for the + * accept()ed socket; this was set earlier in the passive established + * callback for the accept()ed socket via bpf_setsockopt(). + */ + getsockopt_link = bpf_program__attach_cgroup(skel->progs._getsockopt, cg_fd); + if (!ASSERT_OK_PTR(getsockopt_link, "getsockopt prog")) + goto err_out; + + cfd = accept(sfd, NULL, 0); + if (!ASSERT_GE(cfd, 0, "accept")) + goto err_out; + + if (!ASSERT_OK(getsockopt(cfd, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS, &flags, &flagslen), + "getsockopt_flags")) + goto err_out; + ASSERT_EQ(flags & BPF_SOCK_OPS_STATE_CB_FLAG, BPF_SOCK_OPS_STATE_CB_FLAG, + "cb_flags_set"); +err_out: + close(sfd); + if (fd != -1) + close(fd); + if (cfd != -1) + close(cfd); + bpf_link__destroy(getsockopt_link); +} + void test_setget_sockopt(void) { cg_fd = test__join_cgroup(CG_NAME); @@ -191,6 +236,8 @@ void test_setget_sockopt(void) test_udp(AF_INET); test_ktls(AF_INET6); test_ktls(AF_INET); + test_nonstandard_opt(AF_INET); + test_nonstandard_opt(AF_INET6); done: setget_sockopt__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index 60518aed1ffc..6dd4318debbf 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -59,6 +59,8 @@ static const struct sockopt_test sol_tcp_tests[] = { { .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, }, { .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, }, { .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, }, + { .opt = TCP_BPF_SOCK_OPS_CB_FLAGS, .new = BPF_SOCK_OPS_ALL_CB_FLAGS, + .expected = BPF_SOCK_OPS_ALL_CB_FLAGS, }, { .opt = 0, }, }; @@ -353,11 +355,30 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, return 1; } +SEC("cgroup/getsockopt") +int _getsockopt(struct bpf_sockopt *ctx) +{ + struct bpf_sock *sk = ctx->sk; + int *optval = ctx->optval; + struct tcp_sock *tp; + + if (!sk || ctx->level != SOL_TCP || ctx->optname != TCP_BPF_SOCK_OPS_CB_FLAGS) + return 1; + + tp = bpf_core_cast(sk, struct tcp_sock); + if (ctx->optval + sizeof(int) <= ctx->optval_end) { + *optval = tp->bpf_sock_ops_cb_flags; + ctx->retval = 0; + } + return 1; +} + SEC("sockops") int 
skops_sockopt(struct bpf_sock_ops *skops) { struct bpf_sock *bpf_sk = skops->sk; struct sock *sk; + int flags; if (!bpf_sk) return 1; @@ -384,9 +405,8 @@ int skops_sockopt(struct bpf_sock_ops *skops) nr_passive += !(bpf_test_sockopt(skops, sk) || test_tcp_maxseg(skops, sk) || test_tcp_saved_syn(skops, sk)); - bpf_sock_ops_cb_flags_set(skops, - skops->bpf_sock_ops_cb_flags | - BPF_SOCK_OPS_STATE_CB_FLAG); + flags = skops->bpf_sock_ops_cb_flags | BPF_SOCK_OPS_STATE_CB_FLAG; + bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS, &flags, sizeof(flags)); break; case BPF_SOCK_OPS_STATE_CB: if (skops->args[1] == BPF_TCP_CLOSE_WAIT) -- cgit v1.2.3 From 754283ce8326fdce75d589c99cc58456199c123d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 28 Jul 2024 22:34:11 +0200 Subject: tools/nolibc: pass argc, argv and envp to constructors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 2005 glibc has passed argc, argv, and envp to all constructors. As it is cheap and easy to do so, mirror that behaviour in nolibc. This makes it easier to migrate applications to nolibc. Link: https://lore.kernel.org/r/20240728-nolibc-constructor-args-v1-1-36d0bf5cd4c0@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/crt.h | 23 ++++++++++++----------- tools/testing/selftests/nolibc/nolibc-test.c | 5 +++-- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index 43b551468c2a..ac291574f6c0 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -13,11 +13,11 @@ const unsigned long *_auxv __attribute__((weak)); static void __stack_chk_init(void); static void exit(int); -extern void (*const __preinit_array_start[])(void) __attribute__((weak)); -extern void (*const __preinit_array_end[])(void) __attribute__((weak)); +extern void (*const __preinit_array_start[])(int, char **, char**) __attribute__((weak)); +extern void (*const __preinit_array_end[])(int, char **, char**) __attribute__((weak)); -extern void (*const __init_array_start[])(void) __attribute__((weak)); -extern void (*const __init_array_end[])(void) __attribute__((weak)); +extern void (*const __init_array_start[])(int, char **, char**) __attribute__((weak)); +extern void (*const __init_array_end[])(int, char **, char**) __attribute__((weak)); extern void (*const __fini_array_start[])(void) __attribute__((weak)); extern void (*const __fini_array_end[])(void) __attribute__((weak)); @@ -29,7 +29,8 @@ void _start_c(long *sp) char **argv; char **envp; int exitcode; - void (* const *func)(void); + void (* const *ctor_func)(int, char **, char **); + void (* const *dtor_func)(void); const unsigned long *auxv; /* silence potential warning: conflicting types for 'main' */ int _nolibc_main(int, char **, char **) __asm__ ("main"); @@ -66,16 +67,16 @@ void _start_c(long *sp) ; _auxv = auxv; - for (func = __preinit_array_start; func < __preinit_array_end; func++) - (*func)(); - for (func = __init_array_start; func < __init_array_end; func++) - (*func)(); + for (ctor_func = __preinit_array_start; ctor_func < __preinit_array_end; ctor_func++) + (*ctor_func)(argc, argv, envp); + for (ctor_func = __init_array_start; ctor_func < __init_array_end; ctor_func++) + (*ctor_func)(argc, argv, envp); /* go to application */ exitcode = _nolibc_main(argc, argv, envp); - for (func = __fini_array_end; func > __fini_array_start;) - (*--func)(); + for (dtor_func = __fini_array_end; dtor_func > 
__fini_array_start;) + (*--dtor_func)(); exit(exitcode); } diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 093d0512f4c5..0800b10fc3f7 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -686,9 +686,10 @@ static void constructor1(void) } __attribute__((constructor)) -static void constructor2(void) +static void constructor2(int argc, char **argv, char **envp) { - constructor_test_value *= 2; + if (argc && argv && envp) + constructor_test_value *= 2; } int run_startup(int min, int max) -- cgit v1.2.3 From 3ade6ce1255e6e97f91b8ba77408dce9d2292df2 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 6 Aug 2024 08:38:09 -0700 Subject: selftests: rds: add testing infrastructure This adds some basic self-testing infrastructure for RDS-TCP. Signed-off-by: Vegard Nossum Signed-off-by: Chuck Lever Signed-off-by: Allison Henderson Signed-off-by: David S. Miller --- Documentation/dev-tools/gcov.rst | 11 ++ MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/net/rds/Makefile | 12 ++ tools/testing/selftests/net/rds/README.txt | 41 +++++ tools/testing/selftests/net/rds/config.sh | 53 ++++++ tools/testing/selftests/net/rds/run.sh | 224 ++++++++++++++++++++++++ tools/testing/selftests/net/rds/test.py | 262 +++++++++++++++++++++++++++++ 8 files changed, 605 insertions(+) create mode 100644 tools/testing/selftests/net/rds/Makefile create mode 100644 tools/testing/selftests/net/rds/README.txt create mode 100755 tools/testing/selftests/net/rds/config.sh create mode 100755 tools/testing/selftests/net/rds/run.sh create mode 100644 tools/testing/selftests/net/rds/test.py (limited to 'tools') diff --git a/Documentation/dev-tools/gcov.rst b/Documentation/dev-tools/gcov.rst index 5fce2b06f229..dbd26b02ff3c 100644 --- a/Documentation/dev-tools/gcov.rst +++ b/Documentation/dev-tools/gcov.rst @@ -75,6 +75,17 @@ Only files which are linked to the main kernel image or are compiled as kernel modules are supported by this mechanism. +Module specific configs +----------------------- + +Gcov kernel configs for specific modules are described below: + +CONFIG_GCOV_PROFILE_RDS: + Enables GCOV profiling on RDS for checking which functions or + lines are executed. This config is used by the rds selftest to + generate coverage reports. If left unset the report is omitted. 
+ + Files ----- diff --git a/MAINTAINERS index 1a7ea5c7c710..7b291c3a9aa4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19197,6 +19197,7 @@ S: Supported W: https://oss.oracle.com/projects/rds/ F: Documentation/networking/rds.rst F: net/rds/ +F: tools/testing/selftests/net/rds/ RDT - RESOURCE ALLOCATION M: Fenghua Yu diff --git a/tools/testing/selftests/Makefile index bc8fe9e8f7f2..a5f1c0c27dff 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -68,6 +68,7 @@ TARGETS += net/mptcp TARGETS += net/openvswitch TARGETS += net/tcp_ao TARGETS += net/netfilter +TARGETS += net/rds TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd diff --git a/tools/testing/selftests/net/rds/Makefile b/tools/testing/selftests/net/rds/Makefile new file mode 100644 index 000000000000..da9714bc7aad --- /dev/null +++ b/tools/testing/selftests/net/rds/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +all: + @echo mk_build_dir="$(shell pwd)" > include.sh + +TEST_PROGS := run.sh \ + include.sh \ + test.py + +EXTRA_CLEAN := /tmp/rds_logs + +include ../../lib.mk diff --git a/tools/testing/selftests/net/rds/README.txt b/tools/testing/selftests/net/rds/README.txt new file mode 100644 index 000000000000..cbde2951ab13 --- /dev/null +++ b/tools/testing/selftests/net/rds/README.txt @@ -0,0 +1,41 @@ +RDS self-tests +============== + +These scripts provide a coverage test for RDS-TCP by creating two +network namespaces and running RDS packets between them. A loopback +network is provisioned with optional probability of packet loss or +corruption. A workload of 50000 hashes, each 64 characters in size, +is passed over an RDS socket on this test network. A passing test means +the RDS-TCP stack was able to recover properly. The provided config.sh +can be used to compile the kernel with the necessary gcov options. The +kernel may optionally be configured to omit the coverage report as well. + +USAGE: + run.sh [-d logdir] [-l packet_loss] [-c packet_corruption] + [-u packet_duplicate] + +OPTIONS: + -d Log directory. Defaults to tools/testing/selftests/net/rds/rds_logs + + -l Simulates a percentage of packet loss + + -c Simulates a percentage of packet corruption + + -u Simulates a percentage of packet duplication. + +EXAMPLE: + + # Create a suitable gcov enabled .config + tools/testing/selftests/net/rds/config.sh -g + + # Alternately create a gcov disabled .config + tools/testing/selftests/net/rds/config.sh + + # build the kernel + vng --build --config tools/testing/selftests/net/config + + # launch the tests in a VM + vng -v --rwdir ./ --run . --user root --cpus 4 -- \ + "export PYTHONPATH=tools/testing/selftests/net/; tools/testing/selftests/net/rds/run.sh" + +An HTML coverage report will be output in tools/testing/selftests/net/rds/rds_logs/coverage/. diff --git a/tools/testing/selftests/net/rds/config.sh b/tools/testing/selftests/net/rds/config.sh new file mode 100755 index 000000000000..791c8dbe1095 --- /dev/null +++ b/tools/testing/selftests/net/rds/config.sh @@ -0,0 +1,53 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u +set -x + +unset KBUILD_OUTPUT + +GENERATE_GCOV_REPORT=0 +while getopts "g" opt; do + case ${opt} in + g) + GENERATE_GCOV_REPORT=1 + ;; + :) + echo "USAGE: config.sh [-g]" + exit 1 + ;; + ?) + echo "Invalid option: -${OPTARG}."
+ exit 1 + ;; + esac +done + +CONF_FILE="tools/testing/selftests/net/config" + +# no modules +scripts/config --file "$CONF_FILE" --disable CONFIG_MODULES + +# enable RDS +scripts/config --file "$CONF_FILE" --enable CONFIG_RDS +scripts/config --file "$CONF_FILE" --enable CONFIG_RDS_TCP + +if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then + # instrument RDS and only RDS + scripts/config --file "$CONF_FILE" --enable CONFIG_GCOV_KERNEL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_ALL + scripts/config --file "$CONF_FILE" --enable GCOV_PROFILE_RDS +else + scripts/config --file "$CONF_FILE" --disable CONFIG_GCOV_KERNEL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_ALL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_RDS +fi + +# need network namespaces to run tests with veth network interfaces +scripts/config --file "$CONF_FILE" --enable CONFIG_NET_NS +scripts/config --file "$CONF_FILE" --enable CONFIG_VETH + +# simulate packet loss +scripts/config --file "$CONF_FILE" --enable CONFIG_NET_SCH_NETEM + diff --git a/tools/testing/selftests/net/rds/run.sh b/tools/testing/selftests/net/rds/run.sh new file mode 100755 index 000000000000..8aee244f582a --- /dev/null +++ b/tools/testing/selftests/net/rds/run.sh @@ -0,0 +1,224 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u + +unset KBUILD_OUTPUT + +current_dir="$(realpath "$(dirname "$0")")" +build_dir="$current_dir" + +build_include="$current_dir/include.sh" +if test -f "$build_include"; then + # this include will define "$mk_build_dir" as the location the test was + # built. We will need this if the tests are installed in a location + # other than the kernel source + + source "$build_include" + build_dir="$mk_build_dir" +fi + +# This test requires kernel source and the *.gcda data therein +# Locate the top level of the kernel source, and the net/rds +# subfolder with the appropriate *.gcno object files +ksrc_dir="$(realpath "$build_dir"/../../../../../)" +kconfig="$ksrc_dir/.config" +obj_dir="$ksrc_dir/net/rds" + +GCOV_CMD=gcov + +#check to see if the host has the required packages to generate a gcov report +check_gcov_env() +{ + if ! which "$GCOV_CMD" > /dev/null 2>&1; then + echo "Warning: Could not find gcov. " + GENERATE_GCOV_REPORT=0 + return + fi + + # the gcov version must match the gcc version + GCC_VER=$(gcc -dumpfullversion) + GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| awk 'BEGIN {FS="-"}{print $1}') + if [ "$GCOV_VER" != "$GCC_VER" ]; then + #attempt to find a matching gcov version + GCOV_CMD=gcov-$(gcc -dumpversion) + + if ! which "$GCOV_CMD" > /dev/null 2>&1; then + echo "Warning: Could not find an appropriate gcov installation. \ + gcov version must match gcc version" + GENERATE_GCOV_REPORT=0 + return + fi + + #recheck version number of found gcov executable + GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| \ + awk 'BEGIN {FS="-"}{print $1}') + if [ "$GCOV_VER" != "$GCC_VER" ]; then + echo "Warning: Could not find an appropriate gcov installation. \ + gcov version must match gcc version" + GENERATE_GCOV_REPORT=0 + else + echo "Warning: Mismatched gcc and gcov detected. Using $GCOV_CMD" + fi + fi +} + +# Check to see if the kconfig has the required configs to generate a coverage report +check_gcov_conf() +{ + if ! grep -x "CONFIG_GCOV_PROFILE_RDS=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_PROFILE_RDS should be enabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + if ! 
grep -x "CONFIG_GCOV_KERNEL=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_KERNEL should be enabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + if grep -x "CONFIG_GCOV_PROFILE_ALL=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_PROFILE_ALL should be disabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + + if [ "$GENERATE_GCOV_REPORT" -eq 0 ]; then + echo "To enable gcov reports, please run "\ + "\"tools/testing/selftests/net/rds/config.sh -g\" and rebuild the kernel" + else + # if we have the required kernel configs, proceed to check the environment to + # ensure we have the required gcov packages + check_gcov_env + fi +} + +# Kselftest framework requirement - SKIP code is 4. +check_conf_enabled() { + if ! grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then + echo "selftests: [SKIP] This test requires $1 enabled" + echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel" + exit 4 + fi +} +check_conf_disabled() { + if grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then + echo "selftests: [SKIP] This test requires $1 disabled" + echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel" + exit 4 + fi +} +check_conf() { + check_conf_enabled CONFIG_NET_SCH_NETEM + check_conf_enabled CONFIG_VETH + check_conf_enabled CONFIG_NET_NS + check_conf_enabled CONFIG_RDS_TCP + check_conf_enabled CONFIG_RDS + check_conf_disabled CONFIG_MODULES +} + +check_env() +{ + if ! test -d "$obj_dir"; then + echo "selftests: [SKIP] This test requires a kernel source tree" + exit 4 + fi + if ! test -e "$kconfig"; then + echo "selftests: [SKIP] This test requires a configured kernel source tree" + exit 4 + fi + if ! which strace > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without strace" + exit 4 + fi + if ! which tcpdump > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without tcpdump" + exit 4 + fi + + if ! which python3 > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without python3" + exit 4 + fi + + python_major=$(python3 -c "import sys; print(sys.version_info[0])") + python_minor=$(python3 -c "import sys; print(sys.version_info[1])") + if [[ python_major -lt 3 || ( python_major -eq 3 && python_minor -lt 9 ) ]] ; then + echo "selftests: [SKIP] Could not run test without at least python3.9" + python3 -V + exit 4 + fi +} + +LOG_DIR="$current_dir"/rds_logs +PLOSS=0 +PCORRUPT=0 +PDUP=0 +GENERATE_GCOV_REPORT=1 +while getopts "d:l:c:u:" opt; do + case ${opt} in + d) + LOG_DIR=${OPTARG} + ;; + l) + PLOSS=${OPTARG} + ;; + c) + PCORRUPT=${OPTARG} + ;; + u) + PDUP=${OPTARG} + ;; + :) + echo "USAGE: run.sh [-d logdir] [-l packet_loss] [-c packet_corruption]" \ + "[-u packet_duplcate] [-g]" + exit 1 + ;; + ?) + echo "Invalid option: -${OPTARG}." + exit 1 + ;; + esac +done + + +check_env +check_conf +check_gcov_conf + + +rm -fr "$LOG_DIR" +TRACE_FILE="${LOG_DIR}/rds-strace.txt" +COVR_DIR="${LOG_DIR}/coverage/" +mkdir -p "$LOG_DIR" +mkdir -p "$COVR_DIR" + +set +e +echo running RDS tests... +echo Traces will be logged to "$TRACE_FILE" +rm -f "$TRACE_FILE" +strace -T -tt -o "$TRACE_FILE" python3 "$(dirname "$0")/test.py" --timeout 400 -d "$LOG_DIR" \ + -l "$PLOSS" -c "$PCORRUPT" -u "$PDUP" + +test_rc=$? +dmesg > "${LOG_DIR}/dmesg.out" + +if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then + echo saving coverage data... + (set +x; cd /sys/kernel/debug/gcov; find ./* -name '*.gcda' | \ + while read -r f + do + cat < "/sys/kernel/debug/gcov/$f" > "/$f" + done) + + echo running gcovr... 
+ gcovr -s --html-details --gcov-executable "$GCOV_CMD" --gcov-ignore-parse-errors \ + -o "${COVR_DIR}/gcovr" "${ksrc_dir}/net/rds/" +else + echo "Coverage report will be skipped" +fi + +if [ "$test_rc" -eq 0 ]; then + echo "PASS: Test completed successfully" +else + echo "FAIL: Test failed" +fi + +exit "$test_rc" diff --git a/tools/testing/selftests/net/rds/test.py b/tools/testing/selftests/net/rds/test.py new file mode 100644 index 000000000000..e6bb109bcead --- /dev/null +++ b/tools/testing/selftests/net/rds/test.py @@ -0,0 +1,262 @@ +#! /usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import argparse +import ctypes +import errno +import hashlib +import os +import select +import signal +import socket +import subprocess +import sys +import atexit +from pwd import getpwuid +from os import stat +from lib.py import ip + + +libc = ctypes.cdll.LoadLibrary('libc.so.6') +setns = libc.setns + +net0 = 'net0' +net1 = 'net1' + +veth0 = 'veth0' +veth1 = 'veth1' + +# Helper function for creating a socket inside a network namespace. +# We need this because otherwise RDS will detect that the two TCP +# sockets are on the same interface and use the loop transport instead +# of the TCP transport. +def netns_socket(netns, *args): + u0, u1 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) + + child = os.fork() + if child == 0: + # change network namespace + with open(f'/var/run/netns/{netns}') as f: + try: + ret = setns(f.fileno(), 0) + except IOError as e: + print(e.errno) + print(e) + + # create socket in target namespace + s = socket.socket(*args) + + # send resulting socket to parent + socket.send_fds(u0, [], [s.fileno()]) + + sys.exit(0) + + # receive socket from child + _, s, _, _ = socket.recv_fds(u1, 0, 1) + os.waitpid(child, 0) + u0.close() + u1.close() + return socket.fromfd(s[0], *args) + +def signal_handler(sig, frame): + print('Test timed out') + sys.exit(1) + +#Parse out command line arguments. We take an optional +# timeout parameter and an optional log output folder +parser = argparse.ArgumentParser(description="init script args", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("-d", "--logdir", action="store", + help="directory to store logs", default="/tmp") +parser.add_argument('--timeout', help="timeout to terminate hung test", + type=int, default=0) +parser.add_argument('-l', '--loss', help="Simulate tcp packet loss", + type=int, default=0) +parser.add_argument('-c', '--corruption', help="Simulate tcp packet corruption", + type=int, default=0) +parser.add_argument('-u', '--duplicate', help="Simulate tcp packet duplication", + type=int, default=0) +args = parser.parse_args() +logdir=args.logdir +packet_loss=str(args.loss)+'%' +packet_corruption=str(args.corruption)+'%' +packet_duplicate=str(args.duplicate)+'%' + +ip(f"netns add {net0}") +ip(f"netns add {net1}") +ip(f"link add type veth") + +addrs = [ + # we technically don't need different port numbers, but this will + # help identify traffic in the network analyzer + ('10.0.0.1', 10000), + ('10.0.0.2', 20000), +] + +# move interfaces to separate namespaces so they can no longer be +# bound directly; this prevents rds from switching over from the tcp +# transport to the loop transport. 
+ip(f"link set {veth0} netns {net0} up") +ip(f"link set {veth1} netns {net1} up") + + + +# add addresses +ip(f"-n {net0} addr add {addrs[0][0]}/32 dev {veth0}") +ip(f"-n {net1} addr add {addrs[1][0]}/32 dev {veth1}") + +# add routes +ip(f"-n {net0} route add {addrs[1][0]}/32 dev {veth0}") +ip(f"-n {net1} route add {addrs[0][0]}/32 dev {veth1}") + +# sanity check that our two interfaces/addresses are correctly set up +# and communicating by doing a single ping +ip(f"netns exec {net0} ping -c 1 {addrs[1][0]}") + +# Start a packet capture on each network +for net in [net0, net1]: + tcpdump_pid = os.fork() + if tcpdump_pid == 0: + pcap = logdir+'/'+net+'.pcap' + subprocess.check_call(['touch', pcap]) + user = getpwuid(stat(pcap).st_uid).pw_name + ip(f"netns exec {net} /usr/sbin/tcpdump -Z {user} -i any -w {pcap}") + sys.exit(0) + +# simulate packet loss, duplication and corruption +for net, iface in [(net0, veth0), (net1, veth1)]: + ip(f"netns exec {net} /usr/sbin/tc qdisc add dev {iface} root netem \ + corrupt {packet_corruption} loss {packet_loss} duplicate \ + {packet_duplicate}") + +# add a timeout +if args.timeout > 0: + signal.alarm(args.timeout) + signal.signal(signal.SIGALRM, signal_handler) + +sockets = [ + netns_socket(net0, socket.AF_RDS, socket.SOCK_SEQPACKET), + netns_socket(net1, socket.AF_RDS, socket.SOCK_SEQPACKET), +] + +for s, addr in zip(sockets, addrs): + s.bind(addr) + s.setblocking(0) + +fileno_to_socket = { + s.fileno(): s for s in sockets +} + +addr_to_socket = { + addr: s for addr, s in zip(addrs, sockets) +} + +socket_to_addr = { + s: addr for addr, s in zip(addrs, sockets) +} + +send_hashes = {} +recv_hashes = {} + +ep = select.epoll() + +for s in sockets: + ep.register(s, select.EPOLLRDNORM) + +n = 50000 +nr_send = 0 +nr_recv = 0 + +while nr_send < n: + # Send as much as we can without blocking + print("sending...", nr_send, nr_recv) + while nr_send < n: + send_data = hashlib.sha256( + f'packet {nr_send}'.encode('utf-8')).hexdigest().encode('utf-8') + + # pseudo-random send/receive pattern + sender = sockets[nr_send % 2] + receiver = sockets[1 - (nr_send % 3) % 2] + + try: + sender.sendto(send_data, socket_to_addr[receiver]) + send_hashes.setdefault((sender.fileno(), receiver.fileno()), + hashlib.sha256()).update(f'<{send_data}>'.encode('utf-8')) + nr_send = nr_send + 1 + except BlockingIOError as e: + break + except OSError as e: + if e.errno in [errno.ENOBUFS, errno.ECONNRESET, errno.EPIPE]: + break + raise + + # Receive as much as we can without blocking + print("receiving...", nr_send, nr_recv) + while nr_recv < nr_send: + for fileno, eventmask in ep.poll(): + receiver = fileno_to_socket[fileno] + + if eventmask & select.EPOLLRDNORM: + while True: + try: + recv_data, address = receiver.recvfrom(1024) + sender = addr_to_socket[address] + recv_hashes.setdefault((sender.fileno(), + receiver.fileno()), hashlib.sha256()).update( + f'<{recv_data}>'.encode('utf-8')) + nr_recv = nr_recv + 1 + except BlockingIOError as e: + break + + # exercise net/rds/tcp.c:rds_tcp_sysctl_reset() + for net in [net0, net1]: + ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_rcvbuf=10000") + ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_sndbuf=10000") + +print("done", nr_send, nr_recv) + +# the Python socket module doesn't know these +RDS_INFO_FIRST = 10000 +RDS_INFO_LAST = 10017 + +nr_success = 0 +nr_error = 0 + +for s in sockets: + for optname in range(RDS_INFO_FIRST, RDS_INFO_LAST + 1): + # Sigh, the Python socket module doesn't allow us to pass + # buffer 
lengths greater than 1024 for some reason. RDS + # wants multiple pages. + try: + s.getsockopt(socket.SOL_RDS, optname, 1024) + nr_success = nr_success + 1 + except OSError as e: + nr_error = nr_error + 1 + if e.errno == errno.ENOSPC: + # ignore + pass + +print(f"getsockopt(): {nr_success}/{nr_error}") + +print("Stopping network packet captures") +subprocess.check_call(['killall', '-q', 'tcpdump']) + +# We're done sending and receiving stuff, now let's check if what +# we received is what we sent. +for (sender, receiver), send_hash in send_hashes.items(): + recv_hash = recv_hashes.get((sender, receiver)) + + if recv_hash is None: + print("FAIL: No data received") + sys.exit(1) + + if send_hash.hexdigest() != recv_hash.hexdigest(): + print("FAIL: Send/recv mismatch") + print("hash expected:", send_hash.hexdigest()) + print("hash received:", recv_hash.hexdigest()) + sys.exit(1) + + print(f"{sender}/{receiver}: ok") + +print("Success") +sys.exit(0) -- cgit v1.2.3 From 05673c42f73965c6381ab986b4dd2145700e6a0c Mon Sep 17 00:00:00 2001 From: Zixian Cai Date: Fri, 9 Aug 2024 08:01:36 +0000 Subject: perf script python: Add the 'ins_lat' field to event handler For example, when using the Alder Lake PMU memory load event, the instruction latency is stored in 'ins_lat', while the cache latency is stored in 'weight'. This patch reports the 'ins_lat' field for Python scripting. Committer testing: On a Rocket Lake Refresh Intel machine (14th gen): root@number:~# grep -m1 'model name' /proc/cpuinfo model name : Intel(R) Core(TM) i7-14700K root@number:~# perf mem record -a sleep 5 Memory events are enabled on a subset of CPUs: 16-27 [ perf record: Woken up 85 times to write data ] [ perf record: Captured and wrote 41.236 MB perf.data (191390 samples) ] root@number:~# perf evlist -v cpu_atom/mem-loads,ldlat=30/P: type: 10 (cpu_atom), size: 136, config: 0x5d0 (mem-loads), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1, { bp_addr, config1 }: 0x1f cpu_atom/mem-stores/P: type: 10 (cpu_atom), size: 136, config: 0x6d0 (mem-stores), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1 dummy:u: type: 1 (software), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|ADDR|CPU|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 root@number:~# Now generate a python script to then dump the dictionary that now needs to have that 'ins_lat' field: root@number:~# perf script --gen python generated Python script: perf-script.py root@number:~# vim perf-script.py root@number:~# perf script -s perf-script.py | head -40 in trace_begin in trace_end root@number:~# vim perf-script.py Reviewed-by: Adrian Hunter Signed-off-by: Zixian Cai Cc: Alexander Shishkin Cc: Ben Gainey Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240809080137.3590148-1-fzczx123@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/scripting-engines/trace-event-python.c | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index fb00f3ad6815..6971dd6c231f 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -888,6 +888,8 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, set_sample_read_in_dict(dict_sample, sample, evsel); pydict_set_item_string_decref(dict_sample, "weight", PyLong_FromUnsignedLongLong(sample->weight)); + pydict_set_item_string_decref(dict_sample, "ins_lat", + PyLong_FromUnsignedLong(sample->ins_lat)); pydict_set_item_string_decref(dict_sample, "transaction", PyLong_FromUnsignedLongLong(sample->transaction)); set_sample_datasrc_in_dict(dict_sample, sample); @@ -1317,7 +1319,7 @@ static void python_export_sample_table(struct db_export *dbe, struct tables *tables = container_of(dbe, struct tables, dbe); PyObject *t; - t = tuple_new(27); + t = tuple_new(28); tuple_set_d64(t, 0, es->db_id); tuple_set_d64(t, 1, es->evsel->db_id); @@ -1346,6 +1348,7 @@ static void python_export_sample_table(struct db_export *dbe, tuple_set_s32(t, 24, es->sample->flags); tuple_set_d64(t, 25, es->sample->id); tuple_set_d64(t, 26, es->sample->stream_id); + tuple_set_u32(t, 27, es->sample->ins_lat); call_object(tables->sample_handler, t, "sample_table"); -- cgit v1.2.3 From 76fb981ad6774b82f06703c896b492c8659b543b Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Fri, 9 Aug 2024 14:07:28 +0530 Subject: tools/cpupower: display residency value in idle-info Update cpuidle tool to display the residency value of cpuidle states. This addition provides a clearer and more detailed view of idle state information when using cpuidle-info. 
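The reported value comes from the per-state cpuidle sysfs attribute. A stand-alone sketch of the equivalent read (path layout per the cpuidle sysfs ABI; collapsing every failure to a 0 return is a simplification of this sketch, not cpupower's actual error handling):

	#include <stdio.h>

	/* reads /sys/devices/system/cpu/cpuN/cpuidle/stateM/residency */
	static unsigned long read_residency(unsigned int cpu, unsigned int state)
	{
		char path[128];
		unsigned long val = 0;
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%u/cpuidle/state%u/residency",
			 cpu, state);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%lu", &val) != 1)
				val = 0;
			fclose(f);
		}
		return val;
	}

The tool output before and after the patch: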
-------------------------------- Before Patch: -------------------------------- $ cpupower idle-info CPUidle driver: intel_idle CPUidle governor: menu analyzing CPU 28: Number of idle states: 3 Available idle states: POLL C1 C1E POLL: Flags/Description: CPUIDLE CORE POLL IDLE Latency: 0 Usage: 7448 Duration: 207170 C1: Flags/Description: MWAIT 0x00 Latency: 2 Usage: 7023 Duration: 3736853 C1E: Flags/Description: MWAIT 0x01 Latency: 10 Usage: 18468 Duration: 11396212 -------------------------------- After Patch: -------------------------------- $ cpupower idle-info CPUidle driver: intel_idle CPUidle governor: menu analyzing CPU 12: Number of idle states: 3 Available idle states: POLL C1 C1E POLL: Flags/Description: CPUIDLE CORE POLL IDLE Latency: 0 Residency: 0 Usage: 1950 Duration: 38458 C1: Flags/Description: MWAIT 0x00 Latency: 2 Residency: 2 Usage: 10688 Duration: 7133020 C1E: Flags/Description: MWAIT 0x01 Latency: 10 Residency: 20 Usage: 22356 Duration: 15687259 -------------------------------- Signed-off-by: Aboorva Devarajan Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpuidle.c | 8 ++++++++ tools/power/cpupower/lib/cpuidle.h | 2 ++ tools/power/cpupower/utils/cpuidle-info.c | 4 ++++ 3 files changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c index 479c5971aa6d..0ecac009273c 100644 --- a/tools/power/cpupower/lib/cpuidle.c +++ b/tools/power/cpupower/lib/cpuidle.c @@ -116,6 +116,7 @@ enum idlestate_value { IDLESTATE_USAGE, IDLESTATE_POWER, IDLESTATE_LATENCY, + IDLESTATE_RESIDENCY, IDLESTATE_TIME, IDLESTATE_DISABLE, MAX_IDLESTATE_VALUE_FILES @@ -125,6 +126,7 @@ static const char *idlestate_value_files[MAX_IDLESTATE_VALUE_FILES] = { [IDLESTATE_USAGE] = "usage", [IDLESTATE_POWER] = "power", [IDLESTATE_LATENCY] = "latency", + [IDLESTATE_RESIDENCY] = "residency", [IDLESTATE_TIME] = "time", [IDLESTATE_DISABLE] = "disable", }; @@ -254,6 +256,12 @@ unsigned long cpuidle_state_latency(unsigned int cpu, return cpuidle_state_get_one_value(cpu, idlestate, IDLESTATE_LATENCY); } +unsigned long cpuidle_state_residency(unsigned int cpu, + unsigned int idlestate) +{ + return cpuidle_state_get_one_value(cpu, idlestate, IDLESTATE_RESIDENCY); +} + unsigned long cpuidle_state_usage(unsigned int cpu, unsigned int idlestate) { diff --git a/tools/power/cpupower/lib/cpuidle.h b/tools/power/cpupower/lib/cpuidle.h index 2e10fead2e1e..2ab404d40259 100644 --- a/tools/power/cpupower/lib/cpuidle.h +++ b/tools/power/cpupower/lib/cpuidle.h @@ -8,6 +8,8 @@ int cpuidle_state_disable(unsigned int cpu, unsigned int idlestate, unsigned int disable); unsigned long cpuidle_state_latency(unsigned int cpu, unsigned int idlestate); +unsigned long cpuidle_state_residency(unsigned int cpu, + unsigned int idlestate); unsigned long cpuidle_state_usage(unsigned int cpu, unsigned int idlestate); unsigned long long cpuidle_state_time(unsigned int cpu, diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c index 44126a87fa7a..e0d17f0de3fe 100644 --- a/tools/power/cpupower/utils/cpuidle-info.c +++ b/tools/power/cpupower/utils/cpuidle-info.c @@ -64,6 +64,8 @@ static void cpuidle_cpu_output(unsigned int cpu, int verbose) printf(_("Latency: %lu\n"), cpuidle_state_latency(cpu, idlestate)); + printf(_("Residency: %lu\n"), + cpuidle_state_residency(cpu, idlestate)); printf(_("Usage: %lu\n"), cpuidle_state_usage(cpu, idlestate)); printf(_("Duration: %llu\n"), @@ -115,6 +117,8 @@ static void 
proc_cpuidle_cpu_output(unsigned int cpu) printf(_("promotion[--] demotion[--] ")); printf(_("latency[%03lu] "), cpuidle_state_latency(cpu, cstate)); + printf(_("residency[%05lu] "), + cpuidle_state_residency(cpu, cstate)); printf(_("usage[%08lu] "), cpuidle_state_usage(cpu, cstate)); printf(_("duration[%020Lu] \n"), -- cgit v1.2.3 From 13d675aea6cac5bb4619ef9e3fdd90129fe6d139 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 9 Aug 2024 11:32:45 -0300 Subject: perf debuginfo: Fix the build with !HAVE_DWARF_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In that case we have a set of placeholder functions, one of them uses a 'Dwarf_Addr' type that is not present as it is defined in the missing DWARF libraries, so provide a placeholder typedef for that as well. The build error before this patch: In file included from util/annotate.c:28: util/debuginfo.h:44:46: error: unknown type name ‘Dwarf_Addr’ 44 | Dwarf_Addr *offs __maybe_unused, | ^~~~~~~~~~ make[6]: *** [/home/acme/git/perf-tools-next/tools/build/Makefile.build:106: util/annotate.o] Error 1 make[6]: *** Waiting for unfinished jobs.... Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Link: https://lore.kernel.org/lkml/CAM9d7ciushSwEfj7yW4rtDEJBTcCB991V4cswwFEL+cv6QF2pg@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/debuginfo.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/debuginfo.h b/tools/perf/util/debuginfo.h index 4d65b8c605fc..ad6422c3f8ca 100644 --- a/tools/perf/util/debuginfo.h +++ b/tools/perf/util/debuginfo.h @@ -40,6 +40,8 @@ static inline void debuginfo__delete(struct debuginfo *dbg __maybe_unused) { } +typedef void Dwarf_Addr; + static inline int debuginfo__get_text_offset(struct debuginfo *dbg __maybe_unused, Dwarf_Addr *offs __maybe_unused, bool adjust_offset __maybe_unused) -- cgit v1.2.3 From 890a1961c812db801e0f9dfaa4af233aa3c6ab63 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 7 Aug 2024 16:18:20 -0700 Subject: perf tools: Create source symlink in perf object dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a source symlink to the original source in the objdir. This is similar to what the main kernel build script does. Committer testing: ⬢[acme@toolbox perf-tools-next]$ make O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin ⬢[acme@toolbox perf-tools-next]$ ls -la /tmp/build/perf-tools-next/source lrwxrwxrwx. 1 acme acme 41 Aug 9 16:26 /tmp/build/perf-tools-next/source -> /home/acme/git/perf-tools-next/tools/perf ⬢[acme@toolbox perf-tools-next]$ Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Acked-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240807231823.898979-1-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 72534bb72d43..9731f5e84131 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -163,6 +163,8 @@ ifneq ($(OUTPUT),) # for flex/bison parsers. 
VPATH += $(OUTPUT) export VPATH +# create symlink to the original source +SOURCE := $(shell ln -sf $(srctree)/tools/perf $(OUTPUT)/source) endif ifeq ($(V),1) -- cgit v1.2.3 From 336989d00f2140c7524b76126aa62c8a47a5a1d3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 23:15:55 -0700 Subject: perf annotate: Fix --group behavior when leader has no samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the --group option is used, it should display all events together. But the current logic only checks whether the first (leader) event has samples. Let's check the member events as well. It also failed to add the linked samples from member evsels to the output RB-tree, so they were never displayed. Take a look at this example. $ ./perf evlist cpu/mem-loads,ldlat=30/P cpu/mem-stores/P dummy:u It has three events but the 'path_put' function has samples only for the mem-stores (second) event. $ sudo ./perf annotate --stdio -f path_put Percent | Source code & Disassembly of kcore for cpu/mem-stores/P (2 samples, percent: local period) ---------------------------------------------------------------------------------------------------------- : 0 0xffffffffae600020 : 0.00 : ffffffffae600020: endbr64 0.00 : ffffffffae600024: nopl (%rax, %rax) 91.22 : ffffffffae600029: pushq %rbx 0.00 : ffffffffae60002a: movq %rdi, %rbx 0.00 : ffffffffae60002d: movq 8(%rdi), %rdi 8.78 : ffffffffae600031: callq 0xffffffffae614aa0 0.00 : ffffffffae600036: movq (%rbx), %rdi 0.00 : ffffffffae600039: popq %rbx 0.00 : ffffffffae60003a: jmp 0xffffffffae620670 0.00 : ffffffffae60003f: nop Therefore, the function didn't show up when the --group option was used, since the leader ("mem-loads") event has no samples. But now it checks both events.
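Distilled, the fix accumulates samples over the leader and every group member before deciding to skip; a schematic fragment of the new loop (the full hunk is at the end of this message):

	u32 nr_samples = hists->stats.nr_samples;
	struct evsel *evsel;

	/* the leader alone may be empty while members carry samples */
	for_each_group_member(evsel, pos)
		nr_samples += evsel__hists(evsel)->stats.nr_samples;

	if (nr_samples == 0)
		continue;	/* skip only when the whole group is empty */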
Before: $ sudo ./perf annotate --stdio -f --group path_put (no output) After: $ sudo ./perf annotate --stdio -f --group path_put Percent | Source code & Disassembly of kcore for cpu/mem-loads,ldlat=30/P, cpu/mem-stores/P, dummy:u (0 samples, percent: local period) ------------------------------------------------------------------------------------------------------------------------------------------------------------- : 0 0xffffffffae600020 : 0.00 0.00 0.00 : ffffffffae600020: endbr64 0.00 0.00 0.00 : ffffffffae600024: nopl (%rax, %rax) 0.00 91.22 0.00 : ffffffffae600029: pushq %rbx 0.00 0.00 0.00 : ffffffffae60002a: movq %rdi, %rbx 0.00 0.00 0.00 : ffffffffae60002d: movq 8(%rdi), %rdi 0.00 8.78 0.00 : ffffffffae600031: callq 0xffffffffae614aa0 0.00 0.00 0.00 : ffffffffae600036: movq (%rbx), %rdi 0.00 0.00 0.00 : ffffffffae600039: popq %rbx 0.00 0.00 0.00 : ffffffffae60003a: jmp 0xffffffffae620670 0.00 0.00 0.00 : ffffffffae60003f: nop Committer testing: Before: root@number:~# perf annotate --group --stdio2 clear_page_erms root@number:~# After: root@number:~# perf annotate --group --stdio2 clear_page_erms Samples: 125 of events 'cpu_atom/mem-loads,ldlat=30/P, cpu_atom/mem-stores/P, dummy:u', 4000 Hz, Event count (approx.): 13198416, [percent: local period] clear_page_erms() /proc/kcore Percent 0xffffffff990c6cc0 : endbr64 movl $0x1000,%ecx xorl %eax,%eax 0.00 100.00 0.00 rep stosb %al, (%rdi) ← retq int3 int3 int3 int3 nop nop root@number:~# Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20240807061555.1642669-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index efcadb7620b8..1bfe41783a7c 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -632,13 +632,23 @@ static int __cmd_annotate(struct perf_annotate *ann) evlist__for_each_entry(session->evlist, pos) { struct hists *hists = evsel__hists(pos); u32 nr_samples = hists->stats.nr_samples; + struct ui_progress prog; + struct evsel *evsel; - if (nr_samples == 0) + if (!symbol_conf.event_group || !evsel__is_group_leader(pos)) continue; - if (!symbol_conf.event_group || !evsel__is_group_leader(pos)) + for_each_group_member(evsel, pos) + nr_samples += evsel__hists(evsel)->stats.nr_samples; + + if (nr_samples == 0) continue; + ui_progress__init(&prog, nr_samples, + "Sorting group events for output..."); + evsel__output_resort(pos, &prog); + ui_progress__finish(); + hists__find_annotations(hists, pos, ann); } -- cgit v1.2.3 From cb1898f58e0f175d168a5a8c5a269a4d24cbbef5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 23:17:13 -0700 Subject: perf annotate-data: Support --skip-empty option The --skip-empty option is to hide dummy events in a group. Like other output mode in 'perf report' and 'perf annotate', the data-type profiling output should support the option. 
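The same filter recurs in the TUI browser and the stdio printer; schematically, with identifiers as used in the hunks below:

	int nr = 0;
	struct evsel *pos;

	for_each_group_evsel(pos, evsel) {
		/* with --skip-empty, events without samples get no column */
		if (symbol_conf.skip_empty &&
		    evsel__hists(pos)->stats.nr_samples == 0)
			continue;
		nr++;
	}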
Committer testing: With dummy: root@number:~# perf annotate --stdio --group --data-type --skip-empty | head -24 Annotate type: 'pthread_mutex_t' in /usr/lib64/libc.so.6 (50 samples): event[0] = cpu_atom/mem-loads,ldlat=30/P event[1] = cpu_atom/mem-stores/P event[2] = dummy:u ============================================================================ Percent offset size field 100.00 100.00 0.00 0 40 pthread_mutex_t { 100.00 100.00 0.00 0 40 struct __pthread_mutex_s __data { 45.21 84.54 0.00 0 4 int __lock; 0.00 0.00 0.00 4 4 unsigned int __count; 0.00 1.83 0.00 8 4 int __owner; 5.19 10.65 0.00 12 4 unsigned int __nusers; 49.61 2.97 0.00 16 4 int __kind; 0.00 0.00 0.00 20 2 short int __spins; 0.00 0.00 0.00 22 2 short int __elision; 0.00 0.00 0.00 24 16 __pthread_list_t __list { 0.00 0.00 0.00 24 8 struct __pthread_internal_list* __prev; 0.00 0.00 0.00 32 8 struct __pthread_internal_list* __next; }; }; 0.00 0.00 0.00 0 0 char[] __size; 45.21 84.54 0.00 0 8 long int __align; }; Skipping it: root@number:~# perf annotate --stdio --group --data-type --skip-empty | head -24 Annotate type: 'pthread_mutex_t' in /usr/lib64/libc.so.6 (50 samples): event[0] = cpu_atom/mem-loads,ldlat=30/P event[1] = cpu_atom/mem-stores/P ============================================================================ Percent offset size field 100.00 100.00 0 40 pthread_mutex_t { 100.00 100.00 0 40 struct __pthread_mutex_s __data { 45.21 84.54 0 4 int __lock; 0.00 0.00 4 4 unsigned int __count; 0.00 1.83 8 4 int __owner; 5.19 10.65 12 4 unsigned int __nusers; 49.61 2.97 16 4 int __kind; 0.00 0.00 20 2 short int __spins; 0.00 0.00 22 2 short int __elision; 0.00 0.00 24 16 __pthread_list_t __list { 0.00 0.00 24 8 struct __pthread_internal_list* __prev; 0.00 0.00 32 8 struct __pthread_internal_list* __next; }; }; 0.00 0.00 0 0 char[] __size; 45.21 84.54 0 8 long int __align; }; Annotate type: 'pthread_mutexattr_t' in /usr/lib64/libc.so.6 (1 samples): root@number:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240807061713.1642924-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 30 +++++++++++++++++------ tools/perf/util/annotate-data.c | 44 +++++++++++++++++----------------- 2 files changed, 45 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index 8d6bf08d371d..c3db80a7589a 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -46,15 +46,18 @@ static int get_member_overhead(struct annotated_data_type *adt, struct annotated_member *member = entry->data; int i, k; - for (i = 0; i < member->size; i++) { + for (i = 0, k = 0; i < member->size; i++) { struct type_hist *h; struct evsel *evsel; int offset = member->offset + i; for_each_group_evsel(evsel, leader) { + if (symbol_conf.skip_empty && + evsel__hists(evsel)->stats.nr_samples == 0) + continue; + h = adt->histograms[evsel->core.idx]; - k = evsel__group_idx(evsel); - update_hist_entry(&entry->hists[k], &h->addr[offset]); + update_hist_entry(&entry->hists[k++], &h->addr[offset]); } } return 0; @@ -203,6 +206,7 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) struct annotated_data_type *adt = he->mem_type; struct evsel *leader = hists_to_evsel(he->hists); struct evsel *evsel; + int idx = 0; if (member == 
NULL) { bool current = ui_browser__is_current_entry(uib, row); @@ -219,9 +223,12 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) /* print the number */ for_each_group_evsel(evsel, leader) { struct type_hist *h = adt->histograms[evsel->core.idx]; - int idx = evsel__group_idx(evsel); - browser__write_overhead(uib, h, &be->hists[idx], row); + if (symbol_conf.skip_empty && + evsel__hists(evsel)->stats.nr_samples == 0) + continue; + + browser__write_overhead(uib, h, &be->hists[idx++], row); } /* print type info */ @@ -300,8 +307,17 @@ int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, ui_helpline__push("Press ESC to exit"); - if (evsel__is_group_event(evsel)) - browser.nr_events = evsel->core.nr_members; + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + int nr = 0; + + for_each_group_evsel(pos, evsel) { + if (!symbol_conf.skip_empty || + evsel__hists(pos)->stats.nr_samples) + nr++; + } + browser.nr_events = nr; + } ret = annotated_data_browser__collect_entries(&browser); if (ret == 0) diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index c18272092a6b..ff85d190e3ac 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1502,10 +1502,15 @@ static void print_annotated_data_header(struct hist_entry *he, struct evsel *evs struct evsel *pos; int i = 0; - for_each_group_evsel(pos, evsel) - printf(" event[%d] = %s\n", i++, pos->name); + nr_members = 0; + for_each_group_evsel(pos, evsel) { + if (symbol_conf.skip_empty && + evsel__hists(pos)->stats.nr_samples == 0) + continue; - nr_members = evsel->core.nr_members; + printf(" event[%d] = %s\n", i++, pos->name); + nr_members++; + } } if (symbol_conf.show_total_period) { @@ -1540,31 +1545,26 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, { struct annotated_member *child; struct type_hist *h = mem_type->histograms[evsel->core.idx]; - int i, nr_events = 1, samples = 0; + int i, nr_events = 0, samples = 0; u64 period = 0; int width = symbol_conf.show_total_period ? 11 : 7; + struct evsel *pos; - for (i = 0; i < member->size; i++) { - samples += h->addr[member->offset + i].nr_samples; - period += h->addr[member->offset + i].period; - } - print_annotated_data_value(h, period, samples); - - if (evsel__is_group_event(evsel)) { - struct evsel *pos; + for_each_group_evsel(pos, evsel) { + h = mem_type->histograms[pos->core.idx]; - for_each_group_member(pos, evsel) { - h = mem_type->histograms[pos->core.idx]; + if (symbol_conf.skip_empty && + evsel__hists(pos)->stats.nr_samples == 0) + continue; - samples = 0; - period = 0; - for (i = 0; i < member->size; i++) { - samples += h->addr[member->offset + i].nr_samples; - period += h->addr[member->offset + i].period; - } - print_annotated_data_value(h, period, samples); + samples = 0; + period = 0; + for (i = 0; i < member->size; i++) { + samples += h->addr[member->offset + i].nr_samples; + period += h->addr[member->offset + i].period; } - nr_events = evsel->core.nr_members; + print_annotated_data_value(h, period, samples); + nr_events++; } printf(" %10d %10d %*s%s\t%s", -- cgit v1.2.3 From 55850eb4e582f0696f40b8315ac31a45d6a4955e Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:37 +0200 Subject: tools/nolibc: arm: use clang-compatible asm syntax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clang assembler rejects the current syntax. Switch to a syntax accepted by both GCC and clang. 
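The entire difference is the '%' prefix on register names, which GNU as tolerates on arm but clang's integrated assembler rejects. A reduced illustration (a throwaway function, not part of the patch) that builds for arm with both gcc -c and clang -c:

	void demo(void)
	{
		/* clang's integrated assembler rejects: "mov %r0, sp" */
		/* both GCC and clang accept the plain register name: */
		__asm__ volatile ("mov r0, sp\n");
	}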
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-1-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-arm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index cae4afa7c1c7..d1c19d973e55 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -188,8 +188,8 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) { __asm__ volatile ( - "mov %r0, sp\n" /* save stack pointer to %r0, as arg1 of _start_c */ - "and ip, %r0, #-8\n" /* sp must be 8-byte aligned in the callee */ + "mov r0, sp\n" /* save stack pointer to %r0, as arg1 of _start_c */ + "and ip, r0, #-8\n" /* sp must be 8-byte aligned in the callee */ "mov sp, ip\n" "bl _start_c\n" /* transfer to c runtime */ ); -- cgit v1.2.3 From 0daf8c86a45163f35b8d4f7795068c2511dd0ad9 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:38 +0200 Subject: tools/nolibc: mips: load current function to $t9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MIPS calling convention requires the address of the current function to be available in $t9. This was not done until now. For GCC this seems to have worked, but when compiled with clang the executable segfaults instantly. Properly load the address of _start_c() into $t9 before calling it. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-2-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-mips.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 62cc50ef3288..a2ee77ed2fbb 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -194,7 +194,9 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "li $t0, -8\n" "and $sp, $sp, $t0\n" /* $sp must be 8-byte aligned */ "addiu $sp, $sp, -16\n" /* the callee expects to save a0..a3 there */ - "jal _start_c\n" /* transfer to c runtime */ + "lui $t9, %hi(_start_c)\n" /* ABI requires current function address in $t9 */ + "ori $t9, %lo(_start_c)\n" + "jalr $t9\n" /* transfer to c runtime */ " nop\n" /* delayed slot */ ".set pop\n" ); -- cgit v1.2.3 From 1daea158d0aae0770371f3079305a29fdb66829e Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:39 +0200 Subject: tools/nolibc: powerpc: limit stack-protector workaround to GCC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As mentioned in the comment, the workaround for __attribute__((no_stack_protector)) is only necessary on GCC. Avoid applying the workaround on clang, as clang does not recognize __attribute__((__optimize__)) and would fail.
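Folded together with the compiler.h default, the resulting selection logic looks roughly like this; an illustrative merge of the two files for readability, not the literal nolibc code:

	#if !defined(__powerpc64__) && !defined(__clang__)
	/* 32-bit PowerPC GCC: the plain attribute breaks omit-frame-pointer,
	 * so keep the __optimize__ spelling as a workaround.
	 */
	# define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector")))
	#else
	# define __no_stack_protector __attribute__((no_stack_protector))
	#endif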
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-3-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-powerpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-powerpc.h b/tools/include/nolibc/arch-powerpc.h index ac212e6185b2..41ebd394b90c 100644 --- a/tools/include/nolibc/arch-powerpc.h +++ b/tools/include/nolibc/arch-powerpc.h @@ -172,7 +172,7 @@ _ret; \ }) -#ifndef __powerpc64__ +#if !defined(__powerpc64__) && !defined(__clang__) /* FIXME: For 32-bit PowerPC, with newer gcc compilers (e.g. gcc 13.1.0), * "omit-frame-pointer" fails with __attribute__((no_stack_protector)) but * works with __attribute__((__optimize__("-fno-stack-protector"))) -- cgit v1.2.3 From 02a62b551cee585f7c2c93f54d1230d5714ff7d4 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:40 +0200 Subject: tools/nolibc: compiler: introduce __nolibc_has_attribute() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent compilers support __has_attribute() to check if a certain compiler attribute is supported. Unfortunately we have to first check if __has_attribute is supported in the first place and then if a specific attribute is present. These two checks can't be folded into a single condition as that would lead to errors. Nesting the two conditions like below works, but becomes ugly as soon as #else blocks are used as those need to be duplicated for both levels of #if. #if defined __has_attribute # if __has_attribute (nonnull) # define ATTR_NONNULL __attribute__ ((nonnull)) # endif #endif Introduce a new helper which makes the usage of __has_attribute() nicer and migrate the current user to it. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-4-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/compiler.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index beddc3665d69..1730d0454a55 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -6,20 +6,22 @@ #ifndef _NOLIBC_COMPILER_H #define _NOLIBC_COMPILER_H +#if defined(__has_attribute) +# define __nolibc_has_attribute(attr) __has_attribute(attr) +#else +# define __nolibc_has_attribute(attr) 0 +#endif + #if defined(__SSP__) || defined(__SSP_STRONG__) || defined(__SSP_ALL__) || defined(__SSP_EXPLICIT__) #define _NOLIBC_STACKPROTECTOR #endif /* defined(__SSP__) ... 
*/ -#if defined(__has_attribute) -# if __has_attribute(no_stack_protector) -# define __no_stack_protector __attribute__((no_stack_protector)) -# else -# define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector"))) -# endif +#if __nolibc_has_attribute(no_stack_protector) +# define __no_stack_protector __attribute__((no_stack_protector)) #else # define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector"))) -#endif /* defined(__has_attribute) */ +#endif /* __nolibc_has_attribute(no_stack_protector) */ #endif /* _NOLIBC_COMPILER_H */ -- cgit v1.2.3 From d0f8a8973f265f6a276f99d091af99edfb2b87de Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 8 Aug 2024 00:14:13 +0000 Subject: mm/memblock: introduce a new helper memblock_estimated_nr_free_pages() During bootup, system may need the number of free pages in the whole system to do some calculation before all pages are freed to buddy system. Usually this number is get from totalram_pages(). Since we plan to move the free pages accounting in __free_pages_core(), this value may not represent total free pages at the early stage, especially when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled. Instead of using raw memblock api, let's introduce a new helper for user to get the estimated number of free pages from memblock point of view. Signed-off-by: Wei Yang CC: David Hildenbrand Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20240808001415.6298-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/memblock.h | 1 + mm/memblock.c | 17 +++++++++++++++++ tools/include/linux/pfn.h | 1 + 3 files changed, 19 insertions(+) (limited to 'tools') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index fc4d75c6cec3..673d5cae7c81 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -467,6 +467,7 @@ static inline __init_memblock bool memblock_bottom_up(void) phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); +unsigned long memblock_estimated_nr_free_pages(void); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); diff --git a/mm/memblock.c b/mm/memblock.c index 3b9dc2d89b8a..213057603b65 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1731,6 +1731,23 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } +/** + * memblock_estimated_nr_free_pages - return estimated number of free pages + * from memblock point of view + * + * During bootup, subsystems might need a rough estimate of the number of free + * pages in the whole system, before precise numbers are available from the + * buddy. Especially with CONFIG_DEFERRED_STRUCT_PAGE_INIT, the numbers + * obtained from the buddy might be very imprecise during bootup. + * + * Return: + * An estimated number of free pages from memblock point of view. 
+ */ +unsigned long __init memblock_estimated_nr_free_pages(void) +{ + return PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); +} + /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { diff --git a/tools/include/linux/pfn.h b/tools/include/linux/pfn.h index 7512a58189eb..f77a30d70152 100644 --- a/tools/include/linux/pfn.h +++ b/tools/include/linux/pfn.h @@ -7,4 +7,5 @@ #define PFN_UP(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) #define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT) +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) #endif -- cgit v1.2.3 From fe8340a750002269a3e4178efa1229a88814a656 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 2 Aug 2024 14:45:35 +0200 Subject: selftests: rust: config: add trailing newline When multiple config files are passed to the merge_config.sh script and rust/config is the first one, the last config fragment in this file and the first config fragment in the second file won't be set: since this file lacks a trailing newline, the two fragments end up on the same row, like: CONFIG_SAMPLE_RUST_PRINT=mCONFIG_FRAGMENT=y Neither of them will then be enabled when running 'olddefconfig' afterwards. Fix the issue by adding a newline to the file. Signed-off-by: Anders Roxell Acked-by: Miguel Ojeda Signed-off-by: Shuah Khan --- tools/testing/selftests/rust/config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rust/config b/tools/testing/selftests/rust/config index b4002acd40bc..fa06cebae232 100644 --- a/tools/testing/selftests/rust/config +++ b/tools/testing/selftests/rust/config @@ -2,4 +2,4 @@ CONFIG_RUST=y CONFIG_SAMPLES=y CONFIG_SAMPLES_RUST=y CONFIG_SAMPLE_RUST_MINIMAL=m -CONFIG_SAMPLE_RUST_PRINT=m \ No newline at end of file +CONFIG_SAMPLE_RUST_PRINT=m -- cgit v1.2.3 From 8afc0816f5f6213c2f40399317e2ecd43371729c Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 2 Aug 2024 14:45:36 +0200 Subject: selftests: rust: config: disable GCC_PLUGINS CONFIG_RUST depends on !CONFIG_GCC_PLUGINS. Disable CONFIG_GCC_PLUGINS in the rust/config file to make sure it doesn't get enabled. Signed-off-by: Anders Roxell Acked-by: Miguel Ojeda Signed-off-by: Shuah Khan --- tools/testing/selftests/rust/config | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/rust/config b/tools/testing/selftests/rust/config index fa06cebae232..5f942b5c8c17 100644 --- a/tools/testing/selftests/rust/config +++ b/tools/testing/selftests/rust/config @@ -1,3 +1,4 @@ +# CONFIG_GCC_PLUGINS is not set CONFIG_RUST=y CONFIG_SAMPLES=y CONFIG_SAMPLES_RUST=y -- cgit v1.2.3 From 10fbe8c082fdde74b1f0814bc30e194cf330cdf7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Aug 2024 22:37:17 -0700 Subject: selftests: drv-net: rss_ctx: add identifier to traffic comments Include the "name" of the context in the comment for traffic checks. This makes it easier to reason about which context failed when we loop over 32 contexts (it may matter whether we failed on the first or the last one, for example). Reviewed-by: Gal Pressman Reviewed-by: Joe Damato Signed-off-by: Jakub Kicinski Signed-off-by: David S.
Miller --- tools/testing/selftests/drivers/net/hw/rss_ctx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 011508ca604b..1da6b214f4fe 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -90,10 +90,10 @@ def _send_traffic_check(cfg, port, name, params): ksft_ge(directed, 20000, f"traffic on {name}: " + str(cnts)) if params.get('noise'): ksft_lt(sum(cnts[i] for i in params['noise']), directed / 2, - "traffic on other queues:" + str(cnts)) + f"traffic on other queues ({name})':" + str(cnts)) if params.get('empty'): ksft_eq(sum(cnts[i] for i in params['empty']), 0, - "traffic on inactive queues: " + str(cnts)) + f"traffic on inactive queues ({name}): " + str(cnts)) def test_rss_key_indir(cfg): -- cgit v1.2.3 From c1ad8ef804e40fc1ef3cb008e0113d76f4acc1a0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Aug 2024 22:37:28 -0700 Subject: selftests: drv-net: rss_ctx: test dumping RSS contexts Add a test for dumping RSS contexts. Make sure indir table and key are sane when contexts are created with various combination of inputs. Test the dump filtering by ifname and start-context. Reviewed-by: Petr Machata Reviewed-by: Edward Cree Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/hw/rss_ctx.py | 76 ++++++++++++++++++++++- tools/testing/selftests/net/lib/py/ksft.py | 6 ++ 2 files changed, 80 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 1da6b214f4fe..9d7adb3cf33b 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -3,7 +3,7 @@ import datetime import random -from lib.py import ksft_run, ksft_pr, ksft_exit, ksft_eq, ksft_ge, ksft_lt +from lib.py import ksft_run, ksft_pr, ksft_exit, ksft_eq, ksft_ne, ksft_ge, ksft_lt from lib.py import NetDrvEpEnv from lib.py import EthtoolFamily, NetdevFamily from lib.py import KsftSkipEx @@ -302,6 +302,78 @@ def test_hitless_key_update(cfg): ksft_eq(carrier1 - carrier0, 0) +def test_rss_context_dump(cfg): + """ + Test dumping RSS contexts. This tests mostly exercises the kernel APIs. 
+ """ + + # Get a random key of the right size + data = get_rss(cfg) + if 'rss-hash-key' in data: + key_data = _rss_key_rand(len(data['rss-hash-key'])) + key = _rss_key_str(key_data) + else: + key_data = [] + key = "ba:ad" + + ids = [] + try: + ids.append(ethtool_create(cfg, "-X", f"context new")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + + ids.append(ethtool_create(cfg, "-X", f"context new weight 1 1")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + + ids.append(ethtool_create(cfg, "-X", f"context new hkey {key}")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + except CmdExitFailure: + if not ids: + raise KsftSkipEx("Unable to add any contexts") + ksft_pr(f"Added only {len(ids)} out of 3 contexts") + + expect_tuples = set([(cfg.ifname, -1)] + [(cfg.ifname, ctx_id) for ctx_id in ids]) + + # Dump all + ctxs = cfg.ethnl.rss_get({}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ksft_eq(len(tuples), len(set(tuples)), "duplicates in context dump") + ctx_tuples = set([ctx for ctx in tuples if ctx[0] == cfg.ifname]) + ksft_eq(expect_tuples, ctx_tuples) + + # Sanity-check the results + for data in ctxs: + ksft_ne(set(data['indir']), {0}, "indir table is all zero") + ksft_ne(set(data.get('hkey', [1])), {0}, "key is all zero") + + # More specific checks + if len(ids) > 1 and data.get('context') == ids[1]: + ksft_eq(set(data['indir']), {0, 1}, + "ctx1 - indir table mismatch") + if len(ids) > 2 and data.get('context') == ids[2]: + ksft_eq(data['hkey'], bytes(key_data), "ctx2 - key mismatch") + + # Ifindex filter + ctxs = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ctx_tuples = set(tuples) + ksft_eq(len(tuples), len(ctx_tuples), "duplicates in context dump") + ksft_eq(expect_tuples, ctx_tuples) + + # Skip ctx 0 + expect_tuples.remove((cfg.ifname, -1)) + + ctxs = cfg.ethnl.rss_get({'start-context': 1}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ksft_eq(len(tuples), len(set(tuples)), "duplicates in context dump") + ctx_tuples = set([ctx for ctx in tuples if ctx[0] == cfg.ifname]) + ksft_eq(expect_tuples, ctx_tuples) + + # And finally both with ifindex and skip main + ctxs = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}, 'start-context': 1}, dump=True) + ctx_tuples = set([(c['header']['dev-name'], c.get('context', -1)) for c in ctxs]) + ksft_eq(expect_tuples, ctx_tuples) + + def test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None): """ Test separating traffic into RSS contexts. 
@@ -542,7 +614,7 @@ def main() -> None: ksft_run([test_rss_key_indir, test_rss_queue_reconfigure, test_rss_resize, test_hitless_key_update, test_rss_context, test_rss_context4, test_rss_context32, - test_rss_context_queue_reconfigure, + test_rss_context_dump, test_rss_context_queue_reconfigure, test_rss_context_overlap, test_rss_context_overlap2, test_rss_context_out_of_order, test_rss_context4_create_with_cfg], args=(cfg, )) diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index f663e0daec0d..477ae76de93d 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -55,6 +55,12 @@ def ksft_eq(a, b, comment=""): _fail("Check failed", a, "!=", b, comment) +def ksft_ne(a, b, comment=""): + global KSFT_RESULT + if a == b: + _fail("Check failed", a, "==", b, comment) + + def ksft_true(a, comment=""): if not a: _fail("Check failed", a, "does not eval to True", comment) -- cgit v1.2.3 From 4f21bfed691c96da1fc85b76c16e8f6d8fe98a3d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 Aug 2024 09:57:20 -0300 Subject: perf tests pmu: Initialize all fields of test_pmu variable Instead of explicitly initializing just the .name and .alias_name, use designated initialization of just the non-NULL .name field; the compiler will initialize all the other fields to NULL. This makes the code more robust, avoiding the error recently fixed when the .alias_name was used and contained a random value. Reviewed-by: Veronika Molnarova Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Michael Petlan Cc: Namhyung Kim Cc: Radostin Stoyanov Link: https://lore.kernel.org/lkml/e26941f9-f86c-4f2e-b812-20c49fb2c0d3@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index a4730b5dc0d9..be18506f6a24 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -458,10 +458,10 @@ static int test__name_cmp(struct test_suite *test __maybe_unused, int subtest __ */ static int test__pmu_match(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - struct perf_pmu test_pmu; - test_pmu.alias_name = NULL; + struct perf_pmu test_pmu = { + .name = "pmuname", + }; - test_pmu.name = "pmuname"; TEST_ASSERT_EQUAL("Exact match", perf_pmu__match(&test_pmu, "pmuname"), true); TEST_ASSERT_EQUAL("Longer token", perf_pmu__match(&test_pmu, "longertoken"), false); TEST_ASSERT_EQUAL("Shorter token", perf_pmu__match(&test_pmu, "pmu"), false); -- cgit v1.2.3 From e6b56ae7c2d82976398fdbf00858f31193cf5971 Mon Sep 17 00:00:00 2001 From: Martin Liška Date: Fri, 19 Jul 2024 12:57:08 +0200 Subject: perf script: add --addr2line option Similarly to other subcommands (like report and top), it would be handy to be able to provide a path to the addr2line command.
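For example (the path below is only illustrative, assuming a locally built binutils):

    perf script --addr2line=/usr/local/bin/addr2line

This resolves source line numbers with the given binary instead of the "addr2line" found in $PATH.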
Signed-off-by: Martin Liska Cc: Ian Rogers Link: https://lore.kernel.org/r/eadc3e36-029d-4848-9d69-272fe5a83a26@foxlink.cz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 3 +++ tools/perf/builtin-script.c | 2 ++ tools/perf/util/symbol_conf.h | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index ff086ef05a0c..5abb960c4960 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -369,6 +369,9 @@ OPTIONS --demangle-kernel:: Demangle kernel symbol names to human readable form (for C++ kernels). +--addr2line=:: + Path to addr2line binary. + --header Show perf.data header. diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index c16224b1fef3..932167b2362b 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -4052,6 +4052,8 @@ int cmd_script(int argc, const char **argv) "Enable symbol demangling"), OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, "Enable kernel symbol demangling"), + OPT_STRING(0, "addr2line", &symbol_conf.addr2line_path, "path", + "addr2line binary to use for line numbers"), OPT_STRING(0, "time", &script.time_str, "str", "Time span of interest (start,stop)"), OPT_BOOLEAN(0, "inline", &symbol_conf.inline_name, diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index 657cfa5af43c..a9c51acc722f 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -64,7 +64,7 @@ struct symbol_conf { *sym_list_str, *col_width_list_str, *bt_stop_list_str; - char *addr2line_path; + const char *addr2line_path; unsigned long time_quantum; struct strlist *dso_list, *comm_list, -- cgit v1.2.3 From 043da846c2b2dfd5b187bf3a9993b8fb0a6ed94a Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 12 Aug 2024 10:34:59 +0100 Subject: perf docs: Refine the description for the buffer size The current description for the AUX trace buffer size is misleading. When a user specifies the option '-m,512M', it represents a size value in bytes (512MiB), not 512M pages (which would be 512M x 4KiB with a page size of 4KiB). Make the document clear that the normal buffer and the AUX tracing buffer share the same semantics, and sync the documents for consistent text. Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812093459.2575278-1-leo.yan@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-kvm.txt | 6 +++--- tools/perf/Documentation/perf-record.txt | 9 +++++---- tools/perf/Documentation/perf-top.txt | 4 ++-- tools/perf/Documentation/perf-trace.txt | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt index b66be66fe836..c26524d38f47 100644 --- a/tools/perf/Documentation/perf-kvm.txt +++ b/tools/perf/Documentation/perf-kvm.txt @@ -115,9 +115,9 @@ STAT LIVE OPTIONS -m:: --mmap-pages=:: - Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + Number of mmap data pages (must be a power of two) or size + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value.
-a:: --all-cpus:: diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 41e36b4dc765..242223240a08 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -273,10 +273,11 @@ OPTIONS -m:: --mmap-pages=:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. - Also, by adding a comma, the number of mmap pages for AUX - area tracing can be specified. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. + By adding a comma, an additional parameter with the same + semantics used for the normal mmap areas can be specified for + AUX tracing area. -g:: Enables call-graph (stack chain/backtrace) recording for both diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 667e5102075e..af3e4230c72f 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -83,8 +83,8 @@ Default is to monitor all CPUS. -m :: --mmap-pages=:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. -p :: --pid=:: diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index f0da8cf63e9a..6e0cc50bbc13 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -106,8 +106,8 @@ filter out the startup phase of the program, which is often very different. -m:: --mmap-pages=:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. -C:: --cpu:: -- cgit v1.2.3 From 4a4c013d3385b0db85dc361203dc42ff048b6fd6 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Sat, 10 Aug 2024 10:35:04 +0100 Subject: libbpf: Fix license for btf_relocate.c License should be // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) ...as with other libbpf files. Fixes: 19e00c897d50 ("libbpf: Split BTF relocation") Reported-by: Neill Kapron Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240810093504.2111134-1-alan.maguire@oracle.com --- tools/lib/bpf/btf_relocate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf_relocate.c b/tools/lib/bpf/btf_relocate.c index 17f8b32f94a0..4f7399d85eab 100644 --- a/tools/lib/bpf/btf_relocate.c +++ b/tools/lib/bpf/btf_relocate.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2024, Oracle and/or its affiliates. 
*/ #ifndef _GNU_SOURCE -- cgit v1.2.3 From ef32e9b6a325d1d013b30d898b4dff94082902cd Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:41 +0200 Subject: tools/nolibc: move entrypoint specifics to compiler.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specific attributes for the _start entrypoint are duplicated for each architecture. Deduplicate it into a dedicated #define into compiler.h. For clang compatibility, the epilogue will also need to be adapted, so move that one, too. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-5-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-aarch64.h | 4 ++-- tools/include/nolibc/arch-arm.h | 4 ++-- tools/include/nolibc/arch-i386.h | 4 ++-- tools/include/nolibc/arch-loongarch.h | 4 ++-- tools/include/nolibc/arch-mips.h | 4 ++-- tools/include/nolibc/arch-powerpc.h | 4 ++-- tools/include/nolibc/arch-riscv.h | 4 ++-- tools/include/nolibc/arch-s390.h | 4 ++-- tools/include/nolibc/arch-x86_64.h | 4 ++-- tools/include/nolibc/compiler.h | 3 +++ 10 files changed, 21 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h index b23ac1f04035..06fdef7b291a 100644 --- a/tools/include/nolibc/arch-aarch64.h +++ b/tools/include/nolibc/arch-aarch64.h @@ -142,13 +142,13 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "mov x0, sp\n" /* save stack pointer to x0, as arg1 of _start_c */ "and sp, x0, -16\n" /* sp must be 16-byte aligned in the callee */ "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_AARCH64_H */ diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index d1c19d973e55..6180ff99ab43 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -185,7 +185,7 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "mov r0, sp\n" /* save stack pointer to %r0, as arg1 of _start_c */ @@ -193,7 +193,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "mov sp, ip\n" "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_ARM_H */ diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h index 28c26a00a762..ff5afc35bbd8 100644 --- a/tools/include/nolibc/arch-i386.h +++ b/tools/include/nolibc/arch-i386.h @@ -162,7 +162,7 @@ * 2) The deepest stack frame should be set to zero * */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "xor %ebp, %ebp\n" /* zero the stack frame */ @@ -174,7 +174,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "call _start_c\n" /* transfer to c runtime */ "hlt\n" /* ensure it does not return */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } 
#endif /* _NOLIBC_ARCH_I386_H */ diff --git a/tools/include/nolibc/arch-loongarch.h b/tools/include/nolibc/arch-loongarch.h index 3f8ef8f86c0f..fb519545959e 100644 --- a/tools/include/nolibc/arch-loongarch.h +++ b/tools/include/nolibc/arch-loongarch.h @@ -149,14 +149,14 @@ #endif /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "move $a0, $sp\n" /* save stack pointer to $a0, as arg1 of _start_c */ LONG_BSTRINS " $sp, $zero, 3, 0\n" /* $sp must be 16-byte aligned */ "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_LOONGARCH_H */ diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index a2ee77ed2fbb..1791a8ce58da 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -179,7 +179,7 @@ }) /* startup code, note that it's called __start on MIPS */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector __start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void) { __asm__ volatile ( ".set push\n" @@ -200,7 +200,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ " nop\n" /* delayed slot */ ".set pop\n" ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_MIPS_H */ diff --git a/tools/include/nolibc/arch-powerpc.h b/tools/include/nolibc/arch-powerpc.h index 41ebd394b90c..ee2fdb8d601d 100644 --- a/tools/include/nolibc/arch-powerpc.h +++ b/tools/include/nolibc/arch-powerpc.h @@ -184,7 +184,7 @@ #endif /* !__powerpc64__ */ /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { #ifdef __powerpc64__ #if _CALL_ELF == 2 @@ -215,7 +215,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "bl _start_c\n" /* transfer to c runtime */ ); #endif - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_POWERPC_H */ diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index 1927c643c739..8827bf936212 100644 --- a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -140,7 +140,7 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( ".option push\n" @@ -151,7 +151,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "andi sp, a0, -16\n" /* sp must be 16-byte aligned */ "call _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_RISCV_H */ diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index 5d60fd43f883..2ec13d8b9a2d 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -139,7 +139,7 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint 
__no_stack_protector _start(void) { __asm__ volatile ( "lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ @@ -147,7 +147,7 @@ "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } struct s390_mmap_arg_struct { diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index 68609f421934..65252c005a30 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -161,7 +161,7 @@ * 2) The deepest stack frame should be zero (the %rbp). * */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "xor %ebp, %ebp\n" /* zero the stack frame */ @@ -170,7 +170,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "call _start_c\n" /* transfer to c runtime */ "hlt\n" /* ensure it does not return */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #define NOLIBC_ARCH_HAS_MEMMOVE diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index 1730d0454a55..3a1d5b8d4fb9 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -12,6 +12,9 @@ # define __nolibc_has_attribute(attr) 0 #endif +#define __nolibc_entrypoint __attribute__((optimize("Os", "omit-frame-pointer"))) +#define __nolibc_entrypoint_epilogue() __builtin_unreachable() + #if defined(__SSP__) || defined(__SSP_STRONG__) || defined(__SSP_ALL__) || defined(__SSP_EXPLICIT__) #define _NOLIBC_STACKPROTECTOR -- cgit v1.2.3 From e098eebb63cb1c03813559b5db9da4451ba3a318 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:42 +0200 Subject: tools/nolibc: compiler: use attribute((naked)) if available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current entrypoint attributes optimize("Os", "omit-frame-pointer") are intended to avoid all compiler-generated code, like the function prologue and epilogue. This is exactly the use case implemented by the "naked" attribute. Unfortunately this is not implemented by GCC for all targets, so only use it where available. This also provides compatibility with clang, which recognizes the "naked" attribute but not the previously used "optimize" attribute.
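As a rough sketch (not part of the patch; the call target is hypothetical), the difference between the two variants looks like this:

    /* With the "naked" attribute the compiler emits no prologue or epilogue
     * at all, so the inline asm is the entire function body and no trailing
     * hint is needed (supported by clang, and by GCC only on some targets).
     */
    __attribute__((naked)) void entry_naked(void)
    {
            __asm__ volatile ("call hypothetical_start_c\n");
    }

    /* Without it, the optimize() attributes only suppress frame setup; the
     * compiler still believes the function can return, so the body must end
     * with __builtin_unreachable().
     */
    __attribute__((noreturn, optimize("Os", "omit-frame-pointer"))) void entry_opt(void)
    {
            __asm__ volatile ("call hypothetical_start_c\n");
            __builtin_unreachable();
    }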
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-6-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/compiler.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index 3a1d5b8d4fb9..9bc6a706a332 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -12,8 +12,13 @@ # define __nolibc_has_attribute(attr) 0 #endif -#define __nolibc_entrypoint __attribute__((optimize("Os", "omit-frame-pointer"))) -#define __nolibc_entrypoint_epilogue() __builtin_unreachable() +#if __nolibc_has_attribute(naked) +# define __nolibc_entrypoint __attribute__((naked)) +# define __nolibc_entrypoint_epilogue() +#else +# define __nolibc_entrypoint __attribute__((optimize("Os", "omit-frame-pointer"))) +# define __nolibc_entrypoint_epilogue() __builtin_unreachable() +#endif /* __nolibc_has_attribute(naked) */ #if defined(__SSP__) || defined(__SSP_STRONG__) || defined(__SSP_ALL__) || defined(__SSP_EXPLICIT__) -- cgit v1.2.3 From ddae1d7fab8c5dc5d12da120dc410c4f374d37c3 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:43 +0200 Subject: selftests/nolibc: report failure if no testcase passed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When nolibc-test is so broken, it doesn't even start, don't report success. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-7-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 3fbabab46958..46dfbb50fae5 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -157,7 +157,7 @@ LDFLAGS := REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \ END{ printf("\n%3d test(s): %3d passed, %3d skipped, %3d failed => status: ", p+s+f, p, s, f); \ - if (f) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ + if (f || !p) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ printf("\nSee all results in %s\n", ARGV[1]); }' help: -- cgit v1.2.3 From f1a58f61d88642ae1e6e97e9d72d73bc70a93cb8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:44 +0200 Subject: selftests/nolibc: avoid passing NULL to printf("%s") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang on higher optimization levels detects that NULL is passed to printf("%s") and warns about it. While printf() from nolibc gracefully handles that NULL, it is undefined behavior as per POSIX, so the warning is reasonable. Avoid the warning by transforming NULL into a non-NULL placeholder. 
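The pattern boils down to the following (a minimal sketch, not code taken from the patch):

    #include <stdio.h>

    /* POSIX leaves printf("%s", NULL) undefined, so substitute a visible
     * placeholder before the pointer ever reaches printf().
     */
    static int print_expr(const char *expr)
    {
            return printf(" = <%s> ", expr ? expr : "(null)");
    }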
Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-8-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 0800b10fc3f7..6fba7025c5e3 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -542,7 +542,7 @@ int expect_strzr(const char *expr, int llen) { int ret = 0; - llen += printf(" = <%s> ", expr); + llen += printf(" = <%s> ", expr ? expr : "(null)"); if (expr) { ret = 1; result(llen, FAIL); @@ -561,7 +561,7 @@ int expect_strnz(const char *expr, int llen) { int ret = 0; - llen += printf(" = <%s> ", expr); + llen += printf(" = <%s> ", expr ? expr : "(null)"); if (!expr) { ret = 1; result(llen, FAIL); -- cgit v1.2.3 From 1a1200b66fd52dca33255270a09a093592c3b877 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:45 +0200 Subject: selftests/nolibc: determine $(srctree) first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nolibc-test Makefile includes various other Makefiles from the tree. At first these are included with relative paths like "../../../build/Build.include" but as soon as $(srctree) is set up, the inclusions use that instead to build full paths. To keep the style of inclusions consistent, perform the setup $(srctree) as early as possible and use it for all inclusions. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-9-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 46dfbb50fae5..803a4e1bbe24 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -1,9 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for nolibc tests -include ../../../scripts/Makefile.include -include ../../../scripts/utilities.mak +# we're in ".../tools/testing/selftests/nolibc" +ifeq ($(srctree),) +srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) +endif + +include $(srctree)/tools/scripts/Makefile.include +include $(srctree)/tools/scripts/utilities.mak # We need this for the "cc-option" macro. -include ../../../build/Build.include +include $(srctree)/tools/build/Build.include ifneq ($(O),) ifneq ($(call is-absolute,$(O)),y) @@ -11,11 +16,6 @@ $(error Only absolute O= parameters are supported) endif endif -# we're in ".../tools/testing/selftests/nolibc" -ifeq ($(srctree),) -srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) -endif - ifeq ($(ARCH),) include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) -- cgit v1.2.3 From ae574ae37059da67b90916f14b482bbce0c0ab01 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:46 +0200 Subject: selftests/nolibc: add support for LLVM= parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makefile.include can modify CC and CFLAGS for usage with clang. Make use of it. Makefile.include is currently used to handle the O= variable. 
This is incompatible with the LLVM= handling: for O= it has to be included as early as possible, while for LLVM= it needs to be included after CFLAGS are set up. To avoid this incompatibility, switch the O= handling to custom logic. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-10-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 803a4e1bbe24..cdff317c35f2 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -5,7 +5,6 @@ ifeq ($(srctree),) srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) endif -include $(srctree)/tools/scripts/Makefile.include include $(srctree)/tools/scripts/utilities.mak # We need this for the "cc-option" macro. include $(srctree)/tools/build/Build.include @@ -14,6 +13,9 @@ ifneq ($(O),) ifneq ($(call is-absolute,$(O)),y) $(error Only absolute O= parameters are supported) endif +objtree := $(O) +else +objtree ?= $(srctree) endif ifeq ($(ARCH),) @@ -21,8 +23,6 @@ include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) endif -objtree ?= $(srctree) - # XARCH extends the kernel's ARCH with a few variants of the same # architecture that only differ by the configuration, the toolchain # and the Qemu program used. It is copied as-is into ARCH except for @@ -155,6 +155,9 @@ CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wex $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := +# Modify CFLAGS based on LLVM= +include $(srctree)/tools/scripts/Makefile.include + REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \ END{ printf("\n%3d test(s): %3d passed, %3d skipped, %3d failed => status: ", p+s+f, p, s, f); \ if (f || !p) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ -- cgit v1.2.3 From 1bd75aeb5446eb2b1351465e88a98fd784003250 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:47 +0200 Subject: selftests/nolibc: add cc-option compatible with clang cross builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cc-option macro from Build.include is not compatible with clang cross builds, as it does not respect the "--target" and similar flags, set up by Makefile.include. Provide a custom variant which works correctly. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-11-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index cdff317c35f2..b8577086e008 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -6,8 +6,8 @@ srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) endif include $(srctree)/tools/scripts/utilities.mak -# We need this for the "cc-option" macro. -include $(srctree)/tools/build/Build.include +# We need this for the "__cc-option" macro.
+include $(srctree)/scripts/Makefile.compiler ifneq ($(O),) ifneq ($(call is-absolute,$(O)),y) @@ -23,6 +23,8 @@ include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) endif +cc-option = $(call __cc-option, $(CC),$(CLANG_CROSS_FLAGS),$(1),$(2)) + # XARCH extends the kernel's ARCH with a few variants of the same # architecture that only differ by the configuration, the toolchain # and the Qemu program used. It is copied as-is into ARCH except for -- cgit v1.2.3 From 27e458bbebdb0aa2aecce86d22c7f02b66e68ee1 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:48 +0200 Subject: selftests/nolibc: run-tests.sh: avoid overwriting CFLAGS_EXTRA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the user specified their own CFLAGS_EXTRA these should not be overwritten by `-e`. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-12-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 0446e6326a40..324509b99e2c 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -15,7 +15,7 @@ download_location="${cache_dir}/crosstools/" build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 test_mode=system -CFLAGS_EXTRA="-Werror" +werror=1 archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv s390 loongarch" TEMP=$(getopt -o 'j:d:c:b:a:m:peh' -n "$0" -- "$@") @@ -69,7 +69,7 @@ while true; do test_mode="$2" shift 2; continue ;; '-e') - CFLAGS_EXTRA="" + werror=0 shift; continue ;; '-h') print_usage @@ -140,6 +140,9 @@ test_arch() { ct_abi=$(crosstool_abi "$1") cross_compile=$(realpath "${download_location}gcc-${crosstool_version}-nolibc/${ct_arch}-${ct_abi}/bin/${ct_arch}-${ct_abi}-") build_dir="${build_location}/${arch}" + if [ "$werror" -ne 0 ]; then + CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror" + fi MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" O="${build_dir}") mkdir -p "$build_dir" -- cgit v1.2.3 From 801cf69ca03040a75ed73be263aa6f0fdcb8af5d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:49 +0200 Subject: selftests/nolibc: don't use libgcc when building with clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The logic in clang to find the libgcc.a from a GCC toolchain for a specific ABI does not work reliably and can lead to errors. Instead disable libgcc when building with clang, as it's not needed anyways. 
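For background, libgcc normally supplies helper routines the compiler emits calls for. A classic example (illustrative only, not code from the patch) is 64-bit division on a 32-bit target:

    /* On a 32-bit target, the compiler lowers this division to a call to a
     * helper such as __udivdi3. GCC expects it in libgcc; clang can satisfy
     * the same call from its own compiler-rt builtins, so -lgcc is not
     * required for clang builds.
     */
    unsigned long long div64(unsigned long long a, unsigned long long b)
    {
            return a / b;
    }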
Acked-by: Willy Tarreau Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-13-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index b8577086e008..f0ce8264baac 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -157,6 +157,13 @@ CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wex $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := +LIBGCC := -lgcc + +ifneq ($(LLVM),) +# Not needed for clang +LIBGCC := +endif + # Modify CFLAGS based on LLVM= include $(srctree)/tools/scripts/Makefile.include @@ -209,11 +216,11 @@ sysroot/$(ARCH)/include: ifneq ($(NOLIBC_SYSROOT),0) nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c -lgcc + -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c $(LIBGCC) else nolibc-test: nolibc-test.c nolibc-test-linkage.c $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c -lgcc + -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c $(LIBGCC) endif libc-test: nolibc-test.c nolibc-test-linkage.c -- cgit v1.2.3 From 8404af7e13eeef91d0e69b44214cbafc2767b7f2 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:50 +0200 Subject: selftests/nolibc: use correct clang target for s390/systemz MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The target names between GCC and clang differ for s390. While GCC uses "s390", clang uses "systemz". This mapping is not handled by tools/scripts/Makefile.include, so do it in the nolibc-test Makefile. Acked-by: Willy Tarreau Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-14-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index f0ce8264baac..8de98ea7af80 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -167,6 +167,9 @@ endif # Modify CFLAGS based on LLVM= include $(srctree)/tools/scripts/Makefile.include +# GCC uses "s390", clang "systemz" +CLANG_CROSS_FLAGS := $(subst --target=s390-linux,--target=systemz-linux,$(CLANG_CROSS_FLAGS)) + REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \ END{ printf("\n%3d test(s): %3d passed, %3d skipped, %3d failed => status: ", p+s+f, p, s, f); \ if (f || !p) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ -- cgit v1.2.3 From 22ba81c50a49468d80055674d8d6f78afb1c92c4 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:51 +0200 Subject: selftests/nolibc: run-tests.sh: allow building through LLVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nolibc tests can now be properly built with LLVM. 
Expose this through run-tests.sh. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-15-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 324509b99e2c..e7ecda4ae796 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -16,9 +16,10 @@ build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 test_mode=system werror=1 +llvm= archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv s390 loongarch" -TEMP=$(getopt -o 'j:d:c:b:a:m:peh' -n "$0" -- "$@") +TEMP=$(getopt -o 'j:d:c:b:a:m:pelh' -n "$0" -- "$@") eval set -- "$TEMP" unset TEMP @@ -42,6 +43,7 @@ Options: -b [DIR] Build location (default: ${build_location}) -m [MODE] Test mode user/system (default: ${test_mode}) -e Disable -Werror + -l Build with LLVM/clang EOF } @@ -71,6 +73,9 @@ while true; do '-e') werror=0 shift; continue ;; + '-l') + llvm=1 + shift; continue ;; '-h') print_usage exit 0 @@ -143,7 +148,7 @@ test_arch() { if [ "$werror" -ne 0 ]; then CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror" fi - MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" O="${build_dir}") + MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") mkdir -p "$build_dir" if [ "$test_mode" = "system" ] && [ ! -f "${build_dir}/.config" ]; then -- cgit v1.2.3 From 00b04242683ebae91164244883b46e6ab2669d63 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 10 Aug 2024 12:15:02 -0700 Subject: perf annotate-data: Fix a buffer overflow in TUI browser In get_member_overhead(), k is updated when it has an entry in the histogram. But the entry->hists array is allocated with the number of evsels in the group, so k should be reset when iterating over the events using for_each_group_evsel(), otherwise it'd crash due to a buffer overflow. Fixes: cb1898f58e0f175d ("perf annotate-data: Support --skip-empty option") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240810191502.1947959-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index c3db80a7589a..a937b55da736 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -46,11 +46,12 @@ static int get_member_overhead(struct annotated_data_type *adt, struct annotated_member *member = entry->data; int i, k; - for (i = 0, k = 0; i < member->size; i++) { + for (i = 0; i < member->size; i++) { struct type_hist *h; struct evsel *evsel; int offset = member->offset + i; + k = 0; for_each_group_evsel(evsel, leader) { if (symbol_conf.skip_empty && evsel__hists(evsel)->stats.nr_samples == 0) -- cgit v1.2.3 From 040c0f887fdcfe747a3f63c94e9cd29e9ed0b872 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Aug 2024 10:25:33 -0700 Subject: perf lock contention: Change stack_id type to s32 The bpf_get_stackid() helper returns a signed type to check whether it failed to get a stacktrace or not.
But it saved the result in u32 and checked if the value is negative. 376 if (needs_callstack) { 377 pelem->stack_id = bpf_get_stackid(ctx, &stacks, 378 BPF_F_FAST_STACK_CMP | stack_skip); --> 379 if (pelem->stack_id < 0) ./tools/perf/util/bpf_skel/lock_contention.bpf.c:379 contention_begin() warn: unsigned 'pelem->stack_id' is never less than zero. Let's change the type to s32 instead. Fixes: 6d499a6b3d90277d ("perf lock: Print the number of lost entries for BPF") Reported-by: Dan Carpenter Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812172533.2015291-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/lock_data.h | 4 ++-- tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h index 36af11faad03..de12892f992f 100644 --- a/tools/perf/util/bpf_skel/lock_data.h +++ b/tools/perf/util/bpf_skel/lock_data.h @@ -7,11 +7,11 @@ struct tstamp_data { u64 timestamp; u64 lock; u32 flags; - u32 stack_id; + s32 stack_id; }; struct contention_key { - u32 stack_id; + s32 stack_id; u32 pid; u64 lock_addr_or_cgroup; }; diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index e9028235d771..d818e30c5457 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -15,6 +15,7 @@ typedef __u8 u8; typedef __u32 u32; +typedef __s32 s32; typedef __u64 u64; typedef __s64 s64; -- cgit v1.2.3 From 7a75c6c23a2ea8dd22d90805b3a42bd65c53830e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Aug 2024 21:20:04 -0700 Subject: perf vendor events: SKX, CLX, SNR uncore cache event fixes Cache home agent (CHA) events were setting the low rather than high config1 bits. SNR was using CLX CHA events, however its CHA is similar to ICX so remove the events. 
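The change itself is a plain 32-bit shift of the existing filter values, since the opcode/state filter these events use sits in the upper half of the 64-bit config1 on these parts. A quick sketch of the arithmetic (illustrative, not perf code):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t low = 0x40233;       /* where the JSON used to put the filter */
            uint64_t fixed = low << 32;   /* where the hardware expects it */
            printf("config1=0x%" PRIx64 "\n", fixed);  /* prints 0x4023300000000 */
            return 0;
    }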
Incorporate the updates in: https://github.com/intel/perfmon/pull/215 https://github.com/intel/perfmon/pull/216 Fixes: 4cc49942444e958b ("perf vendor events: Update cascadelakex events/metrics") Closes: https://lore.kernel.org/linux-perf-users/CAPhsuW4nem9XZP+b=sJJ7kqXG-cafz0djZf51HsgjCiwkGBA+A@mail.gmail.com/ Reported-by: Song Liu Reviewed-by: Kan Liang Co-authored-by: Weilin Wang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240811042004.421869-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/cascadelakex/uncore-cache.json | 60 +++++++++++----------- .../pmu-events/arch/x86/skylakex/uncore-cache.json | 60 +++++++++++----------- .../arch/x86/snowridgex/uncore-cache.json | 57 -------------------- 3 files changed, 60 insertions(+), 117 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-cache.json index c9596e18ec09..6347eba48810 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-cache.json @@ -4577,7 +4577,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4588,7 +4588,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : DRds issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4599,7 +4599,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4609,7 +4609,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4619,7 +4619,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4630,7 +4630,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4651,7 +4651,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4662,7 +4662,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : DRds issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4673,7 +4673,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4683,7 +4683,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4693,7 +4693,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4704,7 +4704,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4747,7 +4747,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", "Experimental": "1", - "Filter": "config1=0x49033", + "Filter": "config1=0x4903300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO ItoM requests that miss the LLC. 
An ItoM request is used by IIO to request a data write without first reading the data for ownership.", "UMask": "0x24", @@ -4759,7 +4759,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RDCUR", "Experimental": "1", - "Filter": "config1=0x43C33", + "Filter": "config1=0x43c3300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO RdCur requests and miss the LLC. A RdCur request is used by IIO to read data without changing state.", "UMask": "0x24", @@ -4771,7 +4771,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RFO", "Experimental": "1", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO RFO requests that miss the LLC. A read for ownership (RFO) requests a cache line to be cached in E state with the intent to modify.", "UMask": "0x24", @@ -4999,7 +4999,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -5010,7 +5010,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : DRds issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -5021,7 +5021,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -5031,7 +5031,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -5041,7 +5041,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -5052,7 +5052,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -5073,7 +5073,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5084,7 +5084,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : DRds issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5095,7 +5095,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -5105,7 +5105,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -5115,7 +5115,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5126,7 +5126,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5171,7 +5171,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM", "Experimental": "1", - "Filter": "config1=0x49033", + "Filter": "config1=0x4903300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO ItoM requests that miss the LLC. An ItoM is used by IIO to request a data write without first reading the data for ownership.", "UMask": "0x24", @@ -5183,7 +5183,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RDCUR", "Experimental": "1", - "Filter": "config1=0x43C33", + "Filter": "config1=0x43c3300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO RdCur requests that miss the LLC. 
A RdCur request is used by IIO to read data without changing state.", "UMask": "0x24", @@ -5195,7 +5195,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RFO", "Experimental": "1", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO RFO requests that miss the LLC. A read for ownership (RFO) requests data to be cached in E state with the intent to modify.", "UMask": "0x24", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-cache.json index da46a3aeb58c..4fc818626491 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-cache.json @@ -4454,7 +4454,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4465,7 +4465,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : DRds issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4476,7 +4476,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4486,7 +4486,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4496,7 +4496,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4507,7 +4507,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4528,7 +4528,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4539,7 +4539,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : DRds issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4550,7 +4550,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4560,7 +4560,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4570,7 +4570,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4581,7 +4581,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4624,7 +4624,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", "Experimental": "1", - "Filter": "config1=0x49033", + "Filter": "config1=0x4903300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO ItoM requests that miss the LLC. An ItoM request is used by IIO to request a data write without first reading the data for ownership.", "UMask": "0x24", @@ -4636,7 +4636,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RDCUR", "Experimental": "1", - "Filter": "config1=0x43C33", + "Filter": "config1=0x43c3300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO RdCur requests and miss the LLC. 
A RdCur request is used by IIO to read data without changing state.", "UMask": "0x24", @@ -4648,7 +4648,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RFO", "Experimental": "1", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO RFO requests that miss the LLC. A read for ownership (RFO) requests a cache line to be cached in E state with the intent to modify.", "UMask": "0x24", @@ -4865,7 +4865,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4876,7 +4876,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : DRds issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4887,7 +4887,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4897,7 +4897,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4907,7 +4907,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4918,7 +4918,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4939,7 +4939,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4950,7 +4950,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : DRds issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4961,7 +4961,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4971,7 +4971,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4981,7 +4981,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4992,7 +4992,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5037,7 +5037,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM", "Experimental": "1", - "Filter": "config1=0x49033", + "Filter": "config1=0x4903300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO ItoM requests that miss the LLC. An ItoM is used by IIO to request a data write without first reading the data for ownership.", "UMask": "0x24", @@ -5049,7 +5049,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RDCUR", "Experimental": "1", - "Filter": "config1=0x43C33", + "Filter": "config1=0x43c3300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO RdCur requests that miss the LLC. A RdCur request is used by IIO to read data without changing state.", "UMask": "0x24", @@ -5061,7 +5061,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RFO", "Experimental": "1", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO RFO requests that miss the LLC. 
A read for ownership (RFO) requests data to be cached in E state with the intent to modify.", "UMask": "0x24", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json index 7551fb91a9d7..a81776deb2e6 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json @@ -1,61 +1,4 @@ [ - { - "BriefDescription": "MMIO reads. Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_MISSES.MMIO_READ", - "Filter": "config1=0x40040e33", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, - { - "BriefDescription": "MMIO writes. Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_MISSES.MMIO_WRITE", - "Filter": "config1=0x40041e33", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, - { - "BriefDescription": "LLC misses - Uncacheable reads (from cpu) . Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_MISSES.UNCACHEABLE", - "Filter": "config1=0x40e33", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, - { - "BriefDescription": "Streaming stores (full cache line). Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_REFERENCES.STREAMING_FULL", - "Filter": "config1=0x41833", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", - "ScaleUnit": "64Bytes", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, - { - "BriefDescription": "Streaming stores (partial cache line). Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_REFERENCES.STREAMING_PARTIAL", - "Filter": "config1=0x41a33", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", - "ScaleUnit": "64Bytes", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, { "BriefDescription": "CMS Agent0 AD Credits Acquired : For Transgress 0", "Counter": "0,1,2,3", -- cgit v1.2.3 From 05fc5b7de395c82a17871e441a551566a49c1ea8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Aug 2024 12:44:45 -0700 Subject: perf annotate-data: Support folding in TUI browser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like in the hists browser, it should support folding the current entry so that it can hide unwanted details in some data structures. The folded entries will be displayed with the '+' sign, while unfolded entries will have the '-' sign. Entries that have no children will not show any signs. Annotate type: 'struct socket' (1 samples) Percent Offset Size Field - 100.00 0 128 struct socket { ◆ 0.00 0 4 socket_state state; ▒ 0.00 4 2 short int type; ▒ 0.00 8 8 long unsigned int flags; ▒ 0.00 16 8 struct file* file; ▒ 100.00 24 8 struct sock* sk; ▒ 0.00 32 8 struct proto_ops* ops; ▒ - 0.00 64 64 struct socket_wq wq { ▒ - 0.00 64 24 wait_queue_head_t wait { ▒ + 0.00 64 4 spinlock_t lock; ▒ - 0.00 72 16 struct list_head head { ▒ 0.00 72 8 struct list_head* next; ▒ 0.00 80 8 struct list_head* prev; ▒ }; ▒ }; ▒ 0.00 88 8 struct fasync_struct* fasync_list; ▒ 0.00 96 8 long unsigned int flags; ▒ + 0.00 104 16 struct callback_head rcu; ▒ }; ▒ }; ▒ This just adds the display logic for folding; the actual folding action will be implemented in the next patch. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812194447.2049187-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 235 +++++++++++++++++++++++++++++---- 1 file changed, 212 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index a937b55da736..04c73b67cd6c 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -14,6 +14,10 @@ #include "util/evlist.h" #include "util/sort.h" +#define FOLDED_SIGN '+' +#define UNFOLD_SIGN '-' +#define NOCHLD_SIGN ' ' + struct annotated_data_browser { struct ui_browser b; struct list_head entries; @@ -24,7 +28,11 @@ struct browser_entry { struct list_head node; struct annotated_member *data; struct type_hist_entry *hists; - int indent; + struct browser_entry *parent; + struct list_head children; + int indent; /* indentation level, starts from 0 */ + int nr_entries; /* # of visible entries: self + descendants */ + bool folded; /* only can be false when it has children */ }; static struct annotated_data_browser *get_browser(struct ui_browser *uib) @@ -65,13 +73,14 @@ static int get_member_overhead(struct annotated_data_type *adt, } static int add_child_entries(struct annotated_data_browser *browser, + struct browser_entry *parent, struct annotated_data_type *adt, struct annotated_member *member, struct evsel *evsel, int indent) { struct annotated_member *pos; struct browser_entry *entry; - int nr_entries = 0; + struct list_head *parent_list; entry = zalloc(sizeof(*entry)); if (entry == NULL) @@ -84,36 +93,60 @@ static int add_child_entries(struct annotated_data_browser *browser, } entry->data = member; + entry->parent = parent; entry->indent = indent; if
(get_member_overhead(adt, entry, evsel) < 0) { free(entry); return -1; } - list_add_tail(&entry->node, &browser->entries); - nr_entries++; + INIT_LIST_HEAD(&entry->children); + if (parent) + parent_list = &parent->children; + else + parent_list = &browser->entries; - list_for_each_entry(pos, &member->children, node) { - int nr = add_child_entries(browser, adt, pos, evsel, indent + 1); + list_add_tail(&entry->node, parent_list); + list_for_each_entry(pos, &member->children, node) { + int nr = add_child_entries(browser, entry, adt, pos, evsel, + indent + 1); if (nr < 0) return nr; - - nr_entries += nr; } /* add an entry for the closing bracket ("}") */ if (!list_empty(&member->children)) { - entry = zalloc(sizeof(*entry)); - if (entry == NULL) + struct browser_entry *bracket; + + bracket = zalloc(sizeof(*bracket)); + if (bracket == NULL) return -1; - entry->indent = indent; - list_add_tail(&entry->node, &browser->entries); - nr_entries++; + bracket->indent = indent; + bracket->parent = entry; + bracket->folded = true; + bracket->nr_entries = 1; + + INIT_LIST_HEAD(&bracket->children); + list_add_tail(&bracket->node, &entry->children); } - return nr_entries; + /* fold child entries by default */ + entry->folded = true; + entry->nr_entries = 1; + return 0; +} + +static u32 count_visible_entries(struct annotated_data_browser *browser) +{ + int nr = 0; + struct browser_entry *entry; + + list_for_each_entry(entry, &browser->entries, node) + nr += entry->nr_entries; + + return nr; } static int annotated_data_browser__collect_entries(struct annotated_data_browser *browser) @@ -123,9 +156,12 @@ static int annotated_data_browser__collect_entries(struct annotated_data_browser struct evsel *evsel = hists_to_evsel(he->hists); INIT_LIST_HEAD(&browser->entries); + + add_child_entries(browser, /*parent=*/NULL, adt, &adt->self, evsel, + /*indent=*/0); + browser->b.entries = &browser->entries; - browser->b.nr_entries = add_child_entries(browser, adt, &adt->self, - evsel, /*indent=*/0); + browser->b.nr_entries = count_visible_entries(browser); return 0; } @@ -140,9 +176,155 @@ static void annotated_data_browser__delete_entries(struct annotated_data_browser } } +static struct browser_entry *get_first_child(struct browser_entry *entry) +{ + if (list_empty(&entry->children)) + return NULL; + + return list_first_entry(&entry->children, struct browser_entry, node); +} + +static struct browser_entry *get_last_child(struct browser_entry *entry) +{ + if (list_empty(&entry->children)) + return NULL; + + return list_last_entry(&entry->children, struct browser_entry, node); +} + +static bool is_first_child(struct browser_entry *entry) +{ + /* This will be checked in a different way */ + if (entry->parent == NULL) + return false; + + return get_first_child(entry->parent) == entry; +} + +static bool is_last_child(struct browser_entry *entry) +{ + /* This will be checked in a different way */ + if (entry->parent == NULL) + return false; + + return get_last_child(entry->parent) == entry; +} + +static struct browser_entry *browser__prev_entry(struct ui_browser *uib, + struct browser_entry *entry) +{ + struct annotated_data_browser *browser = get_browser(uib); + struct browser_entry *first; + + first = list_first_entry(&browser->entries, struct browser_entry, node); + + while (entry != first) { + if (is_first_child(entry)) + entry = entry->parent; + else { + entry = list_prev_entry(entry, node); + while (!entry->folded) + entry = get_last_child(entry); + } + + if (!uib->filter || !uib->filter(uib, &entry->node)) + return 
entry; + } + return first; +} + +static struct browser_entry *browser__next_entry(struct ui_browser *uib, + struct browser_entry *entry) +{ + struct annotated_data_browser *browser = get_browser(uib); + struct browser_entry *last; + + last = list_last_entry(&browser->entries, struct browser_entry, node); + while (!last->folded) + last = get_last_child(last); + + while (entry != last) { + if (!entry->folded) + entry = get_first_child(entry); + else { + while (is_last_child(entry)) + entry = entry->parent; + + entry = list_next_entry(entry, node); + } + + if (!uib->filter || !uib->filter(uib, &entry->node)) + return entry; + } + return last; +} + +static void browser__seek(struct ui_browser *uib, off_t offset, int whence) +{ + struct annotated_data_browser *browser = get_browser(uib); + struct browser_entry *entry; + + if (uib->nr_entries == 0) + return; + + switch (whence) { + case SEEK_SET: + entry = list_first_entry(&browser->entries, typeof(*entry), node); + if (uib->filter && uib->filter(uib, &entry->node)) + entry = browser__next_entry(uib, entry); + break; + case SEEK_CUR: + entry = list_entry(uib->top, typeof(*entry), node); + break; + case SEEK_END: + entry = list_last_entry(&browser->entries, typeof(*entry), node); + while (!entry->folded) + entry = get_last_child(entry); + if (uib->filter && uib->filter(uib, &entry->node)) + entry = browser__prev_entry(uib, entry); + break; + default: + return; + } + + assert(entry != NULL); + + if (offset > 0) { + while (offset-- != 0) + entry = browser__next_entry(uib, entry); + } else { + while (offset++ != 0) + entry = browser__prev_entry(uib, entry); + } + + uib->top = &entry->node; +} + static unsigned int browser__refresh(struct ui_browser *uib) { - return ui_browser__list_head_refresh(uib); + struct browser_entry *entry, *next; + int row = 0; + + if (uib->top == NULL || uib->top == uib->entries) + browser__seek(uib, SEEK_SET, 0); + + entry = list_entry(uib->top, typeof(*entry), node); + + while (true) { + if (!uib->filter || !uib->filter(uib, &entry->node)) { + ui_browser__gotorc(uib, row, 0); + uib->write(uib, entry, row); + if (++row == uib->rows) + break; + } + next = browser__next_entry(uib, entry); + if (next == entry) + break; + + entry = next; + } + + return row; } static int browser__show(struct ui_browser *uib) @@ -171,7 +353,7 @@ static int browser__show(struct ui_browser *uib) strcpy(title, "Percent"); ui_browser__printf(uib, "%*s %10s %10s %10s %s", - 11 * (browser->nr_events - 1), "", + 2 + 11 * (browser->nr_events - 1), "", title, "Offset", "Size", "Field"); ui_browser__write_nstring(uib, "", uib->width); return 0; @@ -208,12 +390,12 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) struct evsel *leader = hists_to_evsel(he->hists); struct evsel *evsel; int idx = 0; + bool current = ui_browser__is_current_entry(uib, row); if (member == NULL) { - bool current = ui_browser__is_current_entry(uib, row); - /* print the closing bracket */ ui_browser__set_percent_color(uib, 0, current); + ui_browser__printf(uib, "%c ", NOCHLD_SIGN); ui_browser__write_nstring(uib, "", 11 * browser->nr_events); ui_browser__printf(uib, " %10s %10s %*s};", "", "", be->indent * 4, ""); @@ -221,6 +403,13 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) return; } + ui_browser__set_percent_color(uib, 0, current); + + if (!list_empty(&be->children)) + ui_browser__printf(uib, "%c ", be->folded ? 
FOLDED_SIGN : UNFOLD_SIGN); + else + ui_browser__printf(uib, "%c ", NOCHLD_SIGN); + /* print the number */ for_each_group_evsel(evsel, leader) { struct type_hist *h = adt->histograms[evsel->core.idx]; @@ -237,13 +426,13 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) ui_browser__printf(uib, " %10d %10d %s%s", member->offset, member->size, member->type_name, - list_empty(&member->children) ? ";" : " {"); + list_empty(&member->children) || be->folded ? ";" : " {"); } else { ui_browser__printf(uib, " %10d %10d %*s%s\t%s%s", member->offset, member->size, be->indent * 4, "", member->type_name, member->var_name ?: "", - list_empty(&member->children) ? ";" : " {"); + list_empty(&member->children) || be->folded ? ";" : " {"); } /* fill the rest */ ui_browser__write_nstring(uib, "", uib->width); @@ -297,7 +486,7 @@ int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, struct annotated_data_browser browser = { .b = { .refresh = browser__refresh, - .seek = ui_browser__list_head_seek, + .seek = browser__seek, .write = browser__write, .priv = he, .extra_title_lines = 1, -- cgit v1.2.3 From af73856e9ac818bfe8e2f95b32734acbcabe6689 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Aug 2024 12:44:46 -0700 Subject: perf annotate-data: Implement folding in TUI browser Like 'perf report', use the 'e' or 'E' key to toggle folding of the current entry so that it can control displaying child entries. Note that I didn't add 'c' and 'C' keys to collapse the entry, because 'e'/'E' already handle that by toggling the state. Committer testing: Do some 'perf mem record' for some workload or the whole system, using the target options, as usual (--pid/-p, -C/--cpu, -a for system-wide profiling, etc) and then: # perf annotate --skip-empty --data-type=pthread_mutex_t That, by default, will start as --tui, then press 'E' to see the whole struct unfolded, etc.
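To make the mechanics concrete before the diff: every browser entry caches, in nr_entries, how many rows its subtree contributes while visible, so toggling one entry only has to push the size delta up through its ancestors instead of recounting the whole tree. The following is a minimal standalone sketch of that invariant, not the perf code itself; the node layout and the toggle_fold() name are illustrative only:

	#include <stdbool.h>

	/* Illustrative tree node; perf's struct browser_entry plays this role. */
	struct node {
		struct node *parent;
		struct node *child;	/* first child, or NULL */
		struct node *sibling;	/* next sibling, or NULL */
		int nr_entries;		/* visible rows: self + shown descendants */
		bool folded;
	};

	static void toggle_fold(struct node *n)
	{
		struct node *p;
		int old = n->nr_entries;

		if (!n->child)
			return;		/* leaves have nothing to fold */

		if (n->folded) {
			/* unfold: self plus each child's cached subtree count */
			n->nr_entries = 1;
			for (p = n->child; p; p = p->sibling)
				n->nr_entries += p->nr_entries;
		} else {
			/* fold: only this row stays visible */
			n->nr_entries = 1;
		}
		n->folded = !n->folded;

		/* propagate the size change to every ancestor */
		for (p = n->parent; p; p = p->parent)
			p->nr_entries += n->nr_entries - old;
	}

The patch below performs the same ancestor update in two passes, subtracting the old count before the toggle and adding the new one after it, which is equivalent to applying the single delta above.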
Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812194447.2049187-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 98 +++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index 04c73b67cd6c..a5c5ad63425e 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -18,12 +18,6 @@ #define UNFOLD_SIGN '-' #define NOCHLD_SIGN ' ' -struct annotated_data_browser { - struct ui_browser b; - struct list_head entries; - int nr_events; -}; - struct browser_entry { struct list_head node; struct annotated_member *data; @@ -35,6 +29,13 @@ struct browser_entry { bool folded; /* only can be false when it has children */ }; +struct annotated_data_browser { + struct ui_browser b; + struct list_head entries; + struct browser_entry *curr; + int nr_events; +}; + static struct annotated_data_browser *get_browser(struct ui_browser *uib) { return container_of(uib, struct annotated_data_browser, b); @@ -302,6 +303,7 @@ static void browser__seek(struct ui_browser *uib, off_t offset, int whence) static unsigned int browser__refresh(struct ui_browser *uib) { + struct annotated_data_browser *browser = get_browser(uib); struct browser_entry *entry, *next; int row = 0; @@ -314,6 +316,8 @@ static unsigned int browser__refresh(struct ui_browser *uib) if (!uib->filter || !uib->filter(uib, &entry->node)) { ui_browser__gotorc(uib, row, 0); uib->write(uib, entry, row); + if (uib->top_idx + row == uib->index) + browser->curr = entry; if (++row == uib->rows) break; } @@ -438,6 +442,78 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) ui_browser__write_nstring(uib, "", uib->width); } +static void annotated_data_browser__fold(struct annotated_data_browser *browser, + struct browser_entry *entry, + bool recursive) +{ + struct browser_entry *child; + + if (list_empty(&entry->children)) + return; + if (entry->folded && !recursive) + return; + + if (recursive) { + list_for_each_entry(child, &entry->children, node) + annotated_data_browser__fold(browser, child, true); + } + + entry->nr_entries = 1; + entry->folded = true; +} + +static void annotated_data_browser__unfold(struct annotated_data_browser *browser, + struct browser_entry *entry, + bool recursive) +{ + struct browser_entry *child; + int nr_entries; + + if (list_empty(&entry->children)) + return; + if (!entry->folded && !recursive) + return; + + nr_entries = 1; /* for self */ + list_for_each_entry(child, &entry->children, node) { + if (recursive) + annotated_data_browser__unfold(browser, child, true); + + nr_entries += child->nr_entries; + } + + entry->nr_entries = nr_entries; + entry->folded = false; +} + +static void annotated_data_browser__toggle_fold(struct annotated_data_browser *browser, + bool recursive) +{ + struct browser_entry *curr = browser->curr; + struct browser_entry *parent; + + parent = curr->parent; + while (parent) { + parent->nr_entries -= curr->nr_entries; + parent = parent->parent; + } + browser->b.nr_entries -= curr->nr_entries; + + if (curr->folded) + annotated_data_browser__unfold(browser, curr, recursive); + else + annotated_data_browser__fold(browser, curr, recursive); + + parent = curr->parent; + while (parent) { + parent->nr_entries += 
curr->nr_entries; + parent = parent->parent; + } + browser->b.nr_entries += curr->nr_entries; + + assert(browser->b.nr_entries == count_visible_entries(browser)); +} + static int annotated_data_browser__run(struct annotated_data_browser *browser, struct evsel *evsel __maybe_unused, struct hist_browser_timer *hbt) @@ -462,8 +538,18 @@ static int annotated_data_browser__run(struct annotated_data_browser *browser, "UP/DOWN/PGUP\n" "PGDN/SPACE Navigate\n" " Move to prev/next symbol\n" + "e Expand/Collapse current entry\n" + "E Expand/Collapse all children of the current\n" "q/ESC/CTRL+C Exit\n\n"); continue; + case 'e': + annotated_data_browser__toggle_fold(browser, + /*recursive=*/false); + break; + case 'E': + annotated_data_browser__toggle_fold(browser, + /*recursive=*/true); + break; case K_LEFT: case '<': case '>': -- cgit v1.2.3 From 7f3c8f13ad93044ab1d18949b8b840fed7c59c30 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Aug 2024 12:44:47 -0700 Subject: perf annotate-data: Show first-level children by default in TUI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now the default is to fold everything, but then only the name of the top-level data type is shown, which is not very useful. Instead, just expand the top-level entry so that it can show the layout at a higher level. Annotate type: 'struct task_struct' (4 samples) Percent Offset Size Field - 100.00 0 9792 struct task_struct { ◆ + 0.50 0 24 struct thread_info thread_info; ▒ 0.00 24 4 unsigned int __state; ▒ 0.00 32 8 void* stack; ▒ + 0.00 40 4 refcount_t usage; ▒ 0.00 44 4 unsigned int flags; ▒ 0.00 48 4 unsigned int ptrace; ▒ 0.00 52 4 int on_cpu; ▒ + 0.00 56 16 struct __call_single_node wake_entry; ▒ 0.00 72 4 unsigned int wakee_flips; ▒ 0.00 80 8 long unsigned int wakee_flip_decay_ts;▒ 0.00 88 8 struct task_struct* last_wakee; ▒ 0.00 96 4 int recent_used_cpu; ▒ 0.00 100 4 int wake_cpu; ▒ 0.00 104 4 int on_rq; ▒ 0.00 108 4 int prio; ▒ 0.00 112 4 int static_prio; ▒ 0.00 116 4 int normal_prio; ▒ 0.00 120 4 unsigned int rt_priority; ▒ + 0.00 128 256 struct sched_entity se; ▒ + 0.00 384 48 struct sched_rt_entity rt; ▒ + 0.00 432 224 struct sched_dl_entity dl; ▒ 0.00 656 8 struct sched_class* sched_class; ▒ ...
Committer testing: # perf mem record -a sleep 5s # perf annotate --group --data-type=pthread_mutex_t Annotate type: 'pthread_mutex_t' (13 samples) Percent Offset Size Field - 100.00 0 40 pthread_mutex_t { ▒ - 100.00 0 40 struct __pthread_mutex_s __data { ▒ 39.45 0 4 int __lock; ▒ 0.00 4 4 unsigned int __count; ▒ 7.80 8 4 int __owner; ▒ 6.88 12 4 unsigned int __nusers; ▒ 45.87 16 4 int __kind; ▒ 0.00 20 2 short int __spins; ▒ 0.00 22 2 short int __elision; ▒ + 0.00 24 16 __pthread_list_t __list; ▒ }; ▒ 0.00 0 0 char[] __size; ▒ 39.45 0 8 long int __align; Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812194447.2049187-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index a5c5ad63425e..f563a3bb072c 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -596,9 +596,17 @@ int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, } ret = annotated_data_browser__collect_entries(&browser); - if (ret == 0) - ret = annotated_data_browser__run(&browser, evsel, hbt); + if (ret < 0) + goto out; + /* To get the top and current entry */ + browser__refresh(&browser.b); + /* Show the first-level child entries by default */ + annotated_data_browser__toggle_fold(&browser, /*recursive=*/false); + + ret = annotated_data_browser__run(&browser, evsel, hbt); + +out: annotated_data_browser__delete_entries(&browser); return ret; -- cgit v1.2.3 From 79bcd34e0f3da39fda841406ccc957405e724852 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 29 Jul 2024 15:06:20 -0700 Subject: perf inject: Fix leader sampling inserting additional samples The processing of leader samples would turn an individual sample with a group of read values into multiple samples. 'perf inject' would pass through the additional samples increasing the output data file size: $ perf record -g -e "{instructions,cycles}:S" -o perf.orig.data true $ perf script -D -i perf.orig.data | sed -e 's/perf.orig.data/perf.data/g' > orig.txt $ perf inject -i perf.orig.data -o perf.new.data $ perf script -D -i perf.new.data | sed -e 's/perf.new.data/perf.data/g' > new.txt $ diff -u orig.txt new.txt --- orig.txt 2024-07-29 14:29:40.606576769 -0700 +++ new.txt 2024-07-29 14:30:04.142737434 -0700 ... -0xc550@perf.data [0x30]: event: 3 +0xc550@perf.data [0xd0]: event: 9 +. +. ... raw event: size 208 bytes +. 0000: 09 00 00 00 01 00 d0 00 fc 72 01 86 ff ff ff ff .........r...... +. 0010: 74 7d 2c 00 74 7d 2c 00 fb c3 79 f9 ba d5 05 00 t},.t},...y..... +. 0020: e6 cb 1a 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ +. 0030: 02 00 00 00 00 00 00 00 76 01 00 00 00 00 00 00 ........v....... +. 0040: e6 cb 1a 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +. 0050: 62 18 00 00 00 00 00 00 f6 cb 1a 00 00 00 00 00 b............... +. 0060: 00 00 00 00 00 00 00 00 0c 00 00 00 00 00 00 00 ................ +. 0070: 80 ff ff ff ff ff ff ff fc 72 01 86 ff ff ff ff .........r...... +. 0080: f3 0e 6e 85 ff ff ff ff 0c cb 7f 85 ff ff ff ff ..n............. +. 0090: bc f2 87 85 ff ff ff ff 44 af 7f 85 ff ff ff ff ........D....... +. 00a0: bd be 7f 85 ff ff ff ff 26 d0 7f 85 ff ff ff ff ........&....... +. 
00b0: 6d a4 ff 85 ff ff ff ff ea 00 20 86 ff ff ff ff m......... ..... +. 00c0: 00 fe ff ff ff ff ff ff 57 14 4f 43 fc 7e 00 00 ........W.OC.~.. + +1642373909693435 0xc550 [0xd0]: PERF_RECORD_SAMPLE(IP, 0x1): 2915700/2915700: 0xffffffff860172fc period: 1 addr: 0 +... FP chain: nr:12 +..... 0: ffffffffffffff80 +..... 1: ffffffff860172fc +..... 2: ffffffff856e0ef3 +..... 3: ffffffff857fcb0c +..... 4: ffffffff8587f2bc +..... 5: ffffffff857faf44 +..... 6: ffffffff857fbebd +..... 7: ffffffff857fd026 +..... 8: ffffffff85ffa46d +..... 9: ffffffff862000ea +..... 10: fffffffffffffe00 +..... 11: 00007efc434f1457 +... sample_read: +.... group nr 2 +..... id 00000000001acbe6, value 0000000000000176, lost 0 +..... id 00000000001acbf6, value 0000000000001862, lost 0 + +0xc620@perf.data [0x30]: event: 3 ... This behavior is incorrect as in the case above 'perf inject' should have done nothing. Fix this behavior by disabling separating samples for a tool that requests it. Only request this for `perf inject` so as to not affect other perf tools. With the patch and the test above there are no differences between the orig.txt and new.txt. Fixes: e4caec0d1af3d608 ("perf evsel: Add PERF_SAMPLE_READ sample related processing") Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240729220620.2957754-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 1 + tools/perf/util/session.c | 3 +++ tools/perf/util/tool.h | 1 + 3 files changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 7b4a5d56d279..8f3dd8980da0 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2204,6 +2204,7 @@ int cmd_inject(int argc, const char **argv) .finished_init = perf_event__repipe_op2_synth, .compressed = perf_event__repipe_op4_synth, .auxtrace = perf_event__repipe_auxtrace, + .dont_split_sample_group = true, }, .input_name = "-", .samples = LIST_HEAD_INIT(inject.samples), diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f9072e003367..453ca0297c80 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1513,6 +1513,9 @@ static int deliver_sample_group(struct evlist *evlist, int ret = -EINVAL; struct sample_read_value *v = sample->read.group.values; + if (tool->dont_split_sample_group) + return deliver_sample_value(evlist, tool, event, sample, v, machine); + sample_read_group__for_each(v, sample->read.group.nr, read_format) { ret = deliver_sample_value(evlist, tool, event, sample, v, machine); diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index c957fb849ac6..62bbc9cec151 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -85,6 +85,7 @@ struct perf_tool { bool namespace_events; bool cgroup_events; bool no_warn; + bool dont_split_sample_group; enum show_feature_header show_feat_hdr; }; -- cgit v1.2.3 From 4e322c785514e85c5ede49c54d72795a5573fa79 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:53 -0700 Subject: perf auxtrace: Remove dummy tools Add perf_session__deliver_synth_attr_event that synthesizes a perf_record_header_attr event with one id. Remove use of perf_event__synthesize_attr that necessitates the use of the dummy tool in order to pass the session. 
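To show the intended call shape of the new helper before the per-decoder hunks: a caller builds a perf_event_attr, and the helper packages it together with one sample id into a PERF_RECORD_HEADER_ATTR event and delivers it through the session. A hedged sketch of such a caller follows; the function name and the attr values are made up for illustration, and the snippet assumes perf's internal headers:

	static int example_synth_attr(struct perf_session *session)
	{
		struct perf_event_attr attr = {
			/* the helper rejects any other size with -EINVAL */
			.size        = sizeof(attr),
			.type        = PERF_TYPE_HARDWARE,
			.config      = PERF_COUNT_HW_INSTRUCTIONS,
			.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ID,
		};
		u64 id = 1;	/* illustrative sample id */

		return perf_session__deliver_synth_attr_event(session, &attr, id);
	}

Compared with the removed pattern, there is no wrapper struct and no dummy struct perf_tool whose only job was to carry the session pointer into a synthesis callback.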
Reviewed-by: Adrian Hunter Signed-off-by: Ian Rogers Tested-by: Adrian Hunter Tested-by: Leo Yan Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/arm-spe.c | 49 +++++++++------------------------------------ tools/perf/util/cs-etm.c | 33 ++---------------------------- tools/perf/util/intel-bts.c | 31 +--------------------------- tools/perf/util/intel-pt.c | 24 +--------------------- tools/perf/util/session.c | 24 ++++++++++++++++++++++ tools/perf/util/session.h | 3 +++ 6 files changed, 41 insertions(+), 123 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index afbd5869f6bf..fa40f3cb6266 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -1073,35 +1073,6 @@ static void arm_spe_print_info(__u64 *arr) fprintf(stdout, arm_spe_info_fmts[ARM_SPE_PMU_TYPE], arr[ARM_SPE_PMU_TYPE]); } -struct arm_spe_synth { - struct perf_tool dummy_tool; - struct perf_session *session; -}; - -static int arm_spe_event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct arm_spe_synth *arm_spe_synth = - container_of(tool, struct arm_spe_synth, dummy_tool); - - return perf_session__deliver_synth_event(arm_spe_synth->session, - event, NULL); -} - -static int arm_spe_synth_event(struct perf_session *session, - struct perf_event_attr *attr, u64 id) -{ - struct arm_spe_synth arm_spe_synth; - - memset(&arm_spe_synth, 0, sizeof(struct arm_spe_synth)); - arm_spe_synth.session = session; - - return perf_event__synthesize_attr(&arm_spe_synth.dummy_tool, attr, 1, - &id, arm_spe_event_synth); -} - static void arm_spe_set_event_name(struct evlist *evlist, u64 id, const char *name) { @@ -1172,7 +1143,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_flc = true; /* Level 1 data cache miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->l1d_miss_id = id; @@ -1180,7 +1151,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* Level 1 data cache access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->l1d_access_id = id; @@ -1192,7 +1163,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_llc = true; /* Last level cache miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->llc_miss_id = id; @@ -1200,7 +1171,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* Last level cache access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->llc_access_id = id; @@ -1212,7 +1183,7 @@ 
arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_tlb = true; /* TLB miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->tlb_miss_id = id; @@ -1220,7 +1191,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* TLB access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->tlb_access_id = id; @@ -1232,7 +1203,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_branch = true; /* Branch miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->branch_miss_id = id; @@ -1244,7 +1215,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_remote_access = true; /* Remote access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->remote_access_id = id; @@ -1255,7 +1226,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) if (spe->synth_opts.mem) { spe->sample_memory = true; - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->memory_id = id; @@ -1276,7 +1247,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) attr.config = PERF_COUNT_HW_INSTRUCTIONS; attr.sample_period = spe->synth_opts.period; spe->instructions_sample_period = attr.sample_period; - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->instructions_id = id; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index d3e9c64d17d4..566498ebc694 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1599,35 +1599,6 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, return ret; } -struct cs_etm_synth { - struct perf_tool dummy_tool; - struct perf_session *session; -}; - -static int cs_etm__event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct cs_etm_synth *cs_etm_synth = - container_of(tool, struct cs_etm_synth, dummy_tool); - - return perf_session__deliver_synth_event(cs_etm_synth->session, - event, NULL); -} - -static int cs_etm__synth_event(struct perf_session *session, - struct perf_event_attr *attr, u64 id) -{ - struct cs_etm_synth cs_etm_synth; - - memset(&cs_etm_synth, 0, sizeof(struct cs_etm_synth)); - cs_etm_synth.session = session; - - return perf_event__synthesize_attr(&cs_etm_synth.dummy_tool, attr, 1, - &id, cs_etm__event_synth); -} - static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, struct perf_session *session) { @@ -1679,7 +1650,7 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS; attr.sample_period = 1; attr.sample_type |= PERF_SAMPLE_ADDR; - err = cs_etm__synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; etm->branches_sample_type = attr.sample_type; @@ -1702,7 +1673,7 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, attr.config = PERF_COUNT_HW_INSTRUCTIONS; attr.sample_period = 
etm->synth_opts.period; etm->instructions_sample_period = attr.sample_period; - err = cs_etm__synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; etm->instructions_sample_type = attr.sample_type; diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index ec1b3bd9f530..160d346cb450 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -737,35 +737,6 @@ static bool intel_bts_evsel_is_auxtrace(struct perf_session *session, return evsel->core.attr.type == bts->pmu_type; } -struct intel_bts_synth { - struct perf_tool dummy_tool; - struct perf_session *session; -}; - -static int intel_bts_event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct intel_bts_synth *intel_bts_synth = - container_of(tool, struct intel_bts_synth, dummy_tool); - - return perf_session__deliver_synth_event(intel_bts_synth->session, - event, NULL); -} - -static int intel_bts_synth_event(struct perf_session *session, - struct perf_event_attr *attr, u64 id) -{ - struct intel_bts_synth intel_bts_synth; - - memset(&intel_bts_synth, 0, sizeof(struct intel_bts_synth)); - intel_bts_synth.session = session; - - return perf_event__synthesize_attr(&intel_bts_synth.dummy_tool, attr, 1, - &id, intel_bts_event_synth); -} - static int intel_bts_synth_events(struct intel_bts *bts, struct perf_session *session) { @@ -814,7 +785,7 @@ static int intel_bts_synth_events(struct intel_bts *bts, attr.sample_type |= PERF_SAMPLE_ADDR; pr_debug("Synthesizing 'branches' event with id %" PRIu64 " sample type %#" PRIx64 "\n", id, (u64)attr.sample_type); - err = intel_bts_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) { pr_err("%s: failed to synthesize 'branches' event type\n", __func__); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index d6d7b7512505..1fc03bfd14c3 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -3659,37 +3659,15 @@ static int intel_pt_queue_data(struct perf_session *session, data_offset, timestamp); } -struct intel_pt_synth { - struct perf_tool dummy_tool; - struct perf_session *session; -}; - -static int intel_pt_event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct intel_pt_synth *intel_pt_synth = - container_of(tool, struct intel_pt_synth, dummy_tool); - - return perf_session__deliver_synth_event(intel_pt_synth->session, event, - NULL); -} - static int intel_pt_synth_event(struct perf_session *session, const char *name, struct perf_event_attr *attr, u64 id) { - struct intel_pt_synth intel_pt_synth; int err; pr_debug("Synthesizing '%s' event with id %" PRIu64 " sample type %#" PRIx64 "\n", name, id, (u64)attr->sample_type); - memset(&intel_pt_synth, 0, sizeof(struct intel_pt_synth)); - intel_pt_synth.session = session; - - err = perf_event__synthesize_attr(&intel_pt_synth.dummy_tool, attr, 1, - &id, intel_pt_event_synth); + err = perf_session__deliver_synth_attr_event(session, attr, id); if (err) pr_err("%s: failed to synthesize '%s' event type\n", __func__, name); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 453ca0297c80..aa02c1d7b1f9 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1769,6 +1769,30 @@ int perf_session__deliver_synth_event(struct 
perf_session *session, return machines__deliver_event(&session->machines, evlist, event, sample, tool, 0, NULL); } +int perf_session__deliver_synth_attr_event(struct perf_session *session, + const struct perf_event_attr *attr, + u64 id) +{ + union { + struct { + struct perf_record_header_attr attr; + u64 ids[1]; + } attr_id; + union perf_event ev; + } ev = { + .attr_id.attr.header.type = PERF_RECORD_HEADER_ATTR, + .attr_id.attr.header.size = sizeof(ev.attr_id), + .attr_id.ids[0] = id, + }; + + if (attr->size != sizeof(ev.attr_id.attr.attr)) { + pr_debug("Unexpected perf_event_attr size\n"); + return -EINVAL; + } + ev.attr_id.attr.attr = *attr; + return perf_session__deliver_synth_event(session, &ev.ev, NULL); +} + static void event_swap(union perf_event *event, bool sample_id_all) { perf_event__swap_op swap; diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 4c29dc86956f..cc28976bb00e 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -154,6 +154,9 @@ extern volatile int session_done; int perf_session__deliver_synth_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample); +int perf_session__deliver_synth_attr_event(struct perf_session *session, + const struct perf_event_attr *attr, + u64 id); int perf_session__dsos_hit_all(struct perf_session *session); -- cgit v1.2.3 From 1816dc4bc5be050cc543e6e4fcedf3805a2aea20 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:54 -0700 Subject: perf s390-cpumsf: Remove unused struct struct s390_cpumsf_synth was likely cargo culted from other auxtrace examples. It has no users, so remove it. Reviewed-by: Adrian Hunter Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/s390-cpumsf.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 6fe478b0b61b..4ec583e511af 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -952,11 +952,6 @@ s390_cpumsf_process_event(struct perf_session *session, return err; } -struct s390_cpumsf_synth { - struct perf_tool cpumsf_tool; - struct perf_session *session; -}; - static int s390_cpumsf_process_auxtrace_event(struct perf_session *session, union perf_event *event __maybe_unused, -- cgit v1.2.3 From 30f29bae9142f34e978a4861ed07aa512af21416 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:55 -0700 Subject: perf tool: Constify tool pointers The tool pointer (to a struct largely of function pointers) is passed around but is unchanged except at initialization. Change parameter and variable types to be const to limit what can be done with a tool.
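What the const qualifier buys is compile-time enforcement of that property: a handler that receives a const struct perf_tool * can read the callback table, but any attempt to write through the pointer is rejected by the compiler. A tiny generic sketch of the pattern, not perf code:

	#include <stdbool.h>

	/* A callback table in the same spirit as struct perf_tool. */
	struct tool {
		int (*sample)(const struct tool *tool, int cpu);
		bool ordered_events;
	};

	static int my_sample(const struct tool *tool, int cpu)
	{
		/*
		 * tool->ordered_events = false; would now fail to compile
		 * ("assignment of member in read-only object"); reads are fine.
		 */
		return tool->ordered_events ? cpu : -1;
	}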
Reviewed-by: Adrian Hunter Signed-off-by: Ian Rogers Tested-by: Adrian Hunter Tested-by: Leo Yan Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/event.c | 4 +- tools/perf/bench/synthesize.c | 2 +- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-c2c.c | 2 +- tools/perf/builtin-diff.c | 2 +- tools/perf/builtin-inject.c | 70 ++++++++++++++++---------------- tools/perf/builtin-kmem.c | 3 +- tools/perf/builtin-kvm.c | 4 +- tools/perf/builtin-kwork.c | 22 +++++----- tools/perf/builtin-lock.c | 4 +- tools/perf/builtin-mem.c | 4 +- tools/perf/builtin-record.c | 14 +++---- tools/perf/builtin-report.c | 12 +++--- tools/perf/builtin-sched.c | 34 ++++++++-------- tools/perf/builtin-script.c | 41 ++++++++++--------- tools/perf/builtin-stat.c | 8 ++-- tools/perf/builtin-timechart.c | 8 ++-- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 4 +- tools/perf/tests/cpumap.c | 6 +-- tools/perf/tests/dlfilter-test.c | 2 +- tools/perf/tests/dwarf-unwind.c | 2 +- tools/perf/tests/event_update.c | 8 ++-- tools/perf/tests/stat.c | 6 +-- tools/perf/tests/thread-map.c | 2 +- tools/perf/util/arm-spe.c | 6 +-- tools/perf/util/auxtrace.c | 12 +++--- tools/perf/util/auxtrace.h | 20 +++++----- tools/perf/util/bpf-event.c | 4 +- tools/perf/util/build-id.c | 4 +- tools/perf/util/build-id.h | 4 +- tools/perf/util/cs-etm.c | 6 +-- tools/perf/util/data-convert-bt.c | 4 +- tools/perf/util/data-convert-json.c | 4 +- tools/perf/util/event.c | 34 ++++++++-------- tools/perf/util/event.h | 34 ++++++++-------- tools/perf/util/header.c | 6 +-- tools/perf/util/header.h | 4 +- tools/perf/util/hisi-ptt.c | 6 +-- tools/perf/util/intel-bts.c | 6 +-- tools/perf/util/intel-pt.c | 6 +-- tools/perf/util/jitdump.c | 4 +- tools/perf/util/s390-cpumsf.c | 6 +-- tools/perf/util/session.c | 30 +++++++------- tools/perf/util/session.h | 2 +- tools/perf/util/synthetic-events.c | 80 ++++++++++++++++++------------------- tools/perf/util/synthetic-events.h | 70 ++++++++++++++++---------------- tools/perf/util/tool.h | 8 ++-- tools/perf/util/tsc.c | 2 +- 49 files changed, 314 insertions(+), 316 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c index e65b7dbe27fb..a0400707180c 100644 --- a/tools/perf/arch/x86/util/event.c +++ b/tools/perf/arch/x86/util/event.c @@ -15,7 +15,7 @@ #if defined(__x86_64__) struct perf_event__synthesize_extra_kmaps_cb_args { - struct perf_tool *tool; + const struct perf_tool *tool; perf_event__handler_t process; struct machine *machine; union perf_event *event; @@ -65,7 +65,7 @@ static int perf_event__synthesize_extra_kmaps_cb(struct map *map, void *data) return 0; } -int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, +int perf_event__synthesize_extra_kmaps(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c index 7401ebbac100..9b333276cbdb 100644 --- 
a/tools/perf/bench/synthesize.c +++ b/tools/perf/bench/synthesize.c @@ -49,7 +49,7 @@ static const char *const bench_usage[] = { static atomic_t event_count; -static int process_synthesized_event(struct perf_tool *tool __maybe_unused, +static int process_synthesized_event(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 1bfe41783a7c..cc65e6f8f4da 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -279,7 +279,7 @@ static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample, return ret; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index c157bd31f2e5..88c131d05186 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -273,7 +273,7 @@ static void compute_stats(struct c2c_hist_entry *c2c_he, update_stats(&cstats->load, weight); } -static int process_sample_event(struct perf_tool *tool __maybe_unused, +static int process_sample_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 57d300d8e570..4c0567882a7a 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -388,7 +388,7 @@ struct hist_entry_ops block_hist_ops = { .free = block_hist_free, }; -static int diff__process_sample_event(struct perf_tool *tool, +static int diff__process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8f3dd8980da0..283429ccd034 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -134,7 +134,7 @@ struct event_entry { union perf_event event[]; }; -static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, +static int dso__inject_build_id(struct dso *dso, const struct perf_tool *tool, struct machine *machine, u8 cpumode, u32 flags); static int output_bytes(struct perf_inject *inject, void *buf, size_t sz) @@ -149,7 +149,7 @@ static int output_bytes(struct perf_inject *inject, void *buf, size_t sz) return 0; } -static int perf_event__repipe_synth(struct perf_tool *tool, +static int perf_event__repipe_synth(const struct perf_tool *tool, union perf_event *event) { struct perf_inject *inject = container_of(tool, struct perf_inject, @@ -158,7 +158,7 @@ static int perf_event__repipe_synth(struct perf_tool *tool, return output_bytes(inject, event, event->header.size); } -static int perf_event__repipe_oe_synth(struct perf_tool *tool, +static int perf_event__repipe_oe_synth(const struct perf_tool *tool, union perf_event *event, struct ordered_events *oe __maybe_unused) { @@ -166,7 +166,7 @@ static int perf_event__repipe_oe_synth(struct perf_tool *tool, } #ifdef HAVE_JITDUMP -static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused, +static int perf_event__drop_oe(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe __maybe_unused) { @@ -188,7 +188,7 @@ static int perf_event__repipe_op4_synth(struct perf_session *session, return 
perf_event__repipe_synth(session->tool, event); } -static int perf_event__repipe_attr(struct perf_tool *tool, +static int perf_event__repipe_attr(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist) { @@ -206,7 +206,7 @@ static int perf_event__repipe_attr(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -static int perf_event__repipe_event_update(struct perf_tool *tool, +static int perf_event__repipe_event_update(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist __maybe_unused) { @@ -237,7 +237,7 @@ static int copy_bytes(struct perf_inject *inject, struct perf_data *data, off_t static s64 perf_event__repipe_auxtrace(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_inject *inject = container_of(tool, struct perf_inject, tool); int ret; @@ -284,7 +284,7 @@ perf_event__repipe_auxtrace(struct perf_session *session __maybe_unused, #endif -static int perf_event__repipe(struct perf_tool *tool, +static int perf_event__repipe(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -292,7 +292,7 @@ static int perf_event__repipe(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -static int perf_event__drop(struct perf_tool *tool __maybe_unused, +static int perf_event__drop(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -300,7 +300,7 @@ static int perf_event__drop(struct perf_tool *tool __maybe_unused, return 0; } -static int perf_event__drop_aux(struct perf_tool *tool, +static int perf_event__drop_aux(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct machine *machine __maybe_unused) @@ -341,13 +341,13 @@ perf_inject__cut_auxtrace_sample(struct perf_inject *inject, return ev; } -typedef int (*inject_handler)(struct perf_tool *tool, +typedef int (*inject_handler)(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); -static int perf_event__repipe_sample(struct perf_tool *tool, +static int perf_event__repipe_sample(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -372,7 +372,7 @@ static int perf_event__repipe_sample(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -static int perf_event__repipe_mmap(struct perf_tool *tool, +static int perf_event__repipe_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -386,7 +386,7 @@ static int perf_event__repipe_mmap(struct perf_tool *tool, } #ifdef HAVE_JITDUMP -static int perf_event__jit_repipe_mmap(struct perf_tool *tool, +static int perf_event__jit_repipe_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -455,7 +455,7 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, return dso; } -static int perf_event__repipe_buildid_mmap(struct perf_tool *tool, +static int perf_event__repipe_buildid_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -474,7 +474,7 @@ static int perf_event__repipe_buildid_mmap(struct perf_tool *tool, return 
perf_event__repipe(tool, event, sample, machine); } -static int perf_event__repipe_mmap2(struct perf_tool *tool, +static int perf_event__repipe_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -500,7 +500,7 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool, } #ifdef HAVE_JITDUMP -static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, +static int perf_event__jit_repipe_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -524,7 +524,7 @@ static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, } #endif -static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, +static int perf_event__repipe_buildid_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -565,7 +565,7 @@ static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, return 0; } -static int perf_event__repipe_fork(struct perf_tool *tool, +static int perf_event__repipe_fork(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -578,7 +578,7 @@ static int perf_event__repipe_fork(struct perf_tool *tool, return err; } -static int perf_event__repipe_comm(struct perf_tool *tool, +static int perf_event__repipe_comm(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -591,7 +591,7 @@ static int perf_event__repipe_comm(struct perf_tool *tool, return err; } -static int perf_event__repipe_namespaces(struct perf_tool *tool, +static int perf_event__repipe_namespaces(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -603,7 +603,7 @@ static int perf_event__repipe_namespaces(struct perf_tool *tool, return err; } -static int perf_event__repipe_exit(struct perf_tool *tool, +static int perf_event__repipe_exit(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -712,7 +712,7 @@ static bool perf_inject__lookup_known_build_id(struct perf_inject *inject, return false; } -static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, +static int dso__inject_build_id(struct dso *dso, const struct perf_tool *tool, struct machine *machine, u8 cpumode, u32 flags) { struct perf_inject *inject = container_of(tool, struct perf_inject, @@ -743,7 +743,7 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, return 0; } -int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, +int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel __maybe_unused, struct machine *machine) @@ -776,7 +776,7 @@ repipe: return 0; } -static int perf_inject__sched_process_exit(struct perf_tool *tool, +static int perf_inject__sched_process_exit(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel __maybe_unused, @@ -796,7 +796,7 @@ static int perf_inject__sched_process_exit(struct perf_tool *tool, return 0; } -static int perf_inject__sched_switch(struct perf_tool *tool, +static int perf_inject__sched_switch(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -821,7 +821,7 @@ static int perf_inject__sched_switch(struct perf_tool *tool, } #ifdef HAVE_LIBTRACEEVENT -static 
int perf_inject__sched_stat(struct perf_tool *tool, +static int perf_inject__sched_stat(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel, @@ -866,7 +866,7 @@ static int guest_session__output_bytes(struct guest_session *gs, void *buf, size return ret < 0 ? ret : 0; } -static int guest_session__repipe(struct perf_tool *tool, +static int guest_session__repipe(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1032,7 +1032,7 @@ static struct guest_id *guest_session__lookup_id(struct guest_session *gs, u64 i return NULL; } -static int process_attr(struct perf_tool *tool, union perf_event *event, +static int process_attr(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) { @@ -1160,7 +1160,7 @@ static u64 evlist__first_id(struct evlist *evlist) return 0; } -static int process_build_id(struct perf_tool *tool, +static int process_build_id(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1210,7 +1210,7 @@ static int guest_session__add_build_ids(struct guest_session *gs) gs); } -static int guest_session__ksymbol_event(struct perf_tool *tool, +static int guest_session__ksymbol_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1574,7 +1574,7 @@ static int guest_session__flush_events(struct guest_session *gs) return guest_session__inject_events(gs, -1); } -static int host__repipe(struct perf_tool *tool, +static int host__repipe(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -1647,7 +1647,7 @@ static int host__finished_init(struct perf_session *session, union perf_event *e * guest events up to the same time. Finally write out the FINISHED_ROUND event * itself. 
*/ -static int host__finished_round(struct perf_tool *tool, +static int host__finished_round(const struct perf_tool *tool, union perf_event *event, struct ordered_events *oe) { @@ -1665,7 +1665,7 @@ static int host__finished_round(struct perf_tool *tool, return perf_event__repipe_oe_synth(tool, event, oe); } -static int host__context_switch(struct perf_tool *tool, +static int host__context_switch(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -1719,7 +1719,7 @@ static int evsel__check_stype(struct evsel *evsel, u64 sample_type, const char * return 0; } -static int drop_sample(struct perf_tool *tool __maybe_unused, +static int drop_sample(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct evsel *evsel __maybe_unused, diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 6fd95be5032b..859ff018eace 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -955,7 +955,7 @@ static bool perf_kmem__skip_sample(struct perf_sample *sample) typedef int (*tracepoint_handler)(struct evsel *evsel, struct perf_sample *sample); -static int process_sample_event(struct perf_tool *tool __maybe_unused, +static int process_sample_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -2061,4 +2061,3 @@ out_delete: return ret; } - diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 71165036e4ca..a3b903cf4311 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1166,7 +1166,7 @@ static void print_result(struct perf_kvm_stat *kvm) } #if defined(HAVE_TIMERFD_SUPPORT) && defined(HAVE_LIBTRACEEVENT) -static int process_lost_event(struct perf_tool *tool, +static int process_lost_event(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1187,7 +1187,7 @@ static bool skip_sample(struct perf_kvm_stat *kvm, return false; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 56e3f3a5e03a..8ffaa80a2d1d 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -958,7 +958,7 @@ static int top_sched_switch_event(struct perf_kwork *kwork, } static struct kwork_class kwork_irq; -static int process_irq_handler_entry_event(struct perf_tool *tool, +static int process_irq_handler_entry_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -971,7 +971,7 @@ static int process_irq_handler_entry_event(struct perf_tool *tool, return 0; } -static int process_irq_handler_exit_event(struct perf_tool *tool, +static int process_irq_handler_exit_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1037,7 +1037,7 @@ static struct kwork_class kwork_irq = { }; static struct kwork_class kwork_softirq; -static int process_softirq_raise_event(struct perf_tool *tool, +static int process_softirq_raise_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1051,7 +1051,7 @@ static int process_softirq_raise_event(struct perf_tool *tool, return 0; } 
-static int process_softirq_entry_event(struct perf_tool *tool, +static int process_softirq_entry_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1065,7 +1065,7 @@ static int process_softirq_entry_event(struct perf_tool *tool, return 0; } -static int process_softirq_exit_event(struct perf_tool *tool, +static int process_softirq_exit_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1167,7 +1167,7 @@ static struct kwork_class kwork_softirq = { }; static struct kwork_class kwork_workqueue; -static int process_workqueue_activate_work_event(struct perf_tool *tool, +static int process_workqueue_activate_work_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1181,7 +1181,7 @@ static int process_workqueue_activate_work_event(struct perf_tool *tool, return 0; } -static int process_workqueue_execute_start_event(struct perf_tool *tool, +static int process_workqueue_execute_start_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1195,7 +1195,7 @@ static int process_workqueue_execute_start_event(struct perf_tool *tool, return 0; } -static int process_workqueue_execute_end_event(struct perf_tool *tool, +static int process_workqueue_execute_end_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1266,7 +1266,7 @@ static struct kwork_class kwork_workqueue = { }; static struct kwork_class kwork_sched; -static int process_sched_switch_event(struct perf_tool *tool, +static int process_sched_switch_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1945,12 +1945,12 @@ static int perf_kwork__report(struct perf_kwork *kwork) return 0; } -typedef int (*tracepoint_handler)(struct perf_tool *tool, +typedef int (*tracepoint_handler)(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine); -static int perf_kwork__process_tracepoint_sample(struct perf_tool *tool, +static int perf_kwork__process_tracepoint_sample(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 0253184b3b58..6efa9d646637 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1501,7 +1501,7 @@ static const struct evsel_str_handler contention_tracepoints[] = { { "lock:contention_end", evsel__process_contention_end, }, }; -static int process_event_update(struct perf_tool *tool, +static int process_event_update(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist) { @@ -1520,7 +1520,7 @@ static int process_event_update(struct perf_tool *tool, typedef int (*tracepoint_handler)(struct evsel *evsel, struct perf_sample *sample); -static int process_sample_event(struct perf_tool *tool __maybe_unused, +static int process_sample_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index efa700d14c98..0fb4d75ab959 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -179,7 +179,7 @@ out: } static int -dump_raw_samples(struct perf_tool *tool, +dump_raw_samples(const struct perf_tool *tool, union 
perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -253,7 +253,7 @@ out_put: return 0; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel __maybe_unused, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 72345d1e54b0..46410eb3a76b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -609,7 +609,7 @@ static int record__comp_enabled(struct record *rec) return rec->opts.comp_level > 0; } -static int process_synthesized_event(struct perf_tool *tool, +static int process_synthesized_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -620,7 +620,7 @@ static int process_synthesized_event(struct perf_tool *tool, static struct mutex synth_lock; -static int process_locked_synthesized_event(struct perf_tool *tool, +static int process_locked_synthesized_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -705,7 +705,7 @@ static void record__sig_exit(void) #ifdef HAVE_AUXTRACE_SUPPORT -static int record__process_auxtrace(struct perf_tool *tool, +static int record__process_auxtrace(const struct perf_tool *tool, struct mmap *map, union perf_event *event, void *data1, size_t len1, void *data2, size_t len2) @@ -1417,7 +1417,7 @@ static void set_timestamp_boundary(struct record *rec, u64 sample_time) rec->evlist->last_sample_time = sample_time; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -3244,7 +3244,7 @@ static const char * const __record_usage[] = { }; const char * const *record_usage = __record_usage; -static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event, +static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { /* @@ -3256,7 +3256,7 @@ static int build_id__process_mmap(struct perf_tool *tool, union perf_event *even return perf_event__process_mmap(tool, event, sample, machine); } -static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event, +static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { /* @@ -3269,7 +3269,7 @@ static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *eve return perf_event__process_mmap2(tool, event, sample, machine); } -static int process_timestamp_boundary(struct perf_tool *tool, +static int process_timestamp_boundary(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct machine *machine __maybe_unused) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 930052961c1a..5f609a7791ea 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -263,7 +263,7 @@ static int process_feature_event(struct perf_session *session, return 0; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -339,7 +339,7 @@ out_put: return ret; } -static int 
process_read_event(struct perf_tool *tool, +static int process_read_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct evsel *evsel, @@ -765,7 +765,7 @@ static void report__output_resort(struct report *rep) ui_progress__finish(); } -static int count_sample_event(struct perf_tool *tool __maybe_unused, +static int count_sample_event(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct evsel *evsel, @@ -777,7 +777,7 @@ static int count_sample_event(struct perf_tool *tool __maybe_unused, return 0; } -static int count_lost_samples_event(struct perf_tool *tool, +static int count_lost_samples_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine __maybe_unused) @@ -793,7 +793,7 @@ static int count_lost_samples_event(struct perf_tool *tool, return 0; } -static int process_attr(struct perf_tool *tool __maybe_unused, +static int process_attr(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist); @@ -1233,7 +1233,7 @@ parse_percent_limit(const struct option *opt, const char *str, return 0; } -static int process_attr(struct perf_tool *tool __maybe_unused, +static int process_attr(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist) { diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 8750b5f2d49b..2c60bd3a8149 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1489,7 +1489,7 @@ again: } } -static int process_sched_wakeup_event(struct perf_tool *tool, +static int process_sched_wakeup_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1502,7 +1502,7 @@ static int process_sched_wakeup_event(struct perf_tool *tool, return 0; } -static int process_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused, +static int process_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused, struct evsel *evsel __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1770,7 +1770,7 @@ out: return 0; } -static int process_sched_switch_event(struct perf_tool *tool, +static int process_sched_switch_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1796,7 +1796,7 @@ static int process_sched_switch_event(struct perf_tool *tool, return err; } -static int process_sched_runtime_event(struct perf_tool *tool, +static int process_sched_runtime_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1809,7 +1809,7 @@ static int process_sched_runtime_event(struct perf_tool *tool, return 0; } -static int perf_sched__process_fork_event(struct perf_tool *tool, +static int perf_sched__process_fork_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -1826,7 +1826,7 @@ static int perf_sched__process_fork_event(struct perf_tool *tool, return 0; } -static int process_sched_migrate_task_event(struct perf_tool *tool, +static int process_sched_migrate_task_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1839,12 +1839,12 @@ static int process_sched_migrate_task_event(struct perf_tool *tool, return 0; } -typedef int (*tracepoint_handler)(struct 
perf_tool *tool, +typedef int (*tracepoint_handler)(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine); -static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused, +static int perf_sched__process_tracepoint_sample(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel, @@ -1860,7 +1860,7 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_ return err; } -static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused, +static int perf_sched__process_comm(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2506,7 +2506,7 @@ static void timehist_print_wakeup_event(struct perf_sched *sched, printf("\n"); } -static int timehist_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused, +static int timehist_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct evsel *evsel __maybe_unused, struct perf_sample *sample __maybe_unused, @@ -2515,7 +2515,7 @@ static int timehist_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused, return 0; } -static int timehist_sched_wakeup_event(struct perf_tool *tool, +static int timehist_sched_wakeup_event(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct evsel *evsel, struct perf_sample *sample, @@ -2599,7 +2599,7 @@ static void timehist_print_migration_event(struct perf_sched *sched, printf("\n"); } -static int timehist_migrate_task_event(struct perf_tool *tool, +static int timehist_migrate_task_event(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct evsel *evsel, struct perf_sample *sample, @@ -2627,7 +2627,7 @@ static int timehist_migrate_task_event(struct perf_tool *tool, return 0; } -static int timehist_sched_change_event(struct perf_tool *tool, +static int timehist_sched_change_event(const struct perf_tool *tool, union perf_event *event, struct evsel *evsel, struct perf_sample *sample, @@ -2758,7 +2758,7 @@ out: return rc; } -static int timehist_sched_switch_event(struct perf_tool *tool, +static int timehist_sched_switch_event(const struct perf_tool *tool, union perf_event *event, struct evsel *evsel, struct perf_sample *sample, @@ -2767,7 +2767,7 @@ static int timehist_sched_switch_event(struct perf_tool *tool, return timehist_sched_change_event(tool, event, evsel, sample, machine); } -static int process_lost(struct perf_tool *tool __maybe_unused, +static int process_lost(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine __maybe_unused) @@ -3010,13 +3010,13 @@ static void timehist_print_summary(struct perf_sched *sched, printf(" (x %d)\n", sched->max_cpu.cpu); } -typedef int (*sched_handler)(struct perf_tool *tool, +typedef int (*sched_handler)(const struct perf_tool *tool, union perf_event *event, struct evsel *evsel, struct perf_sample *sample, struct machine *machine); -static int perf_timehist__process_sample(struct perf_tool *tool, +static int perf_timehist__process_sample(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 932167b2362b..31a6f800d55f 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2399,7 +2399,7 @@ static bool 
filter_cpu(struct perf_sample *sample) return false; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -2486,7 +2486,7 @@ out_put: // Used when scr->per_event_dump is not set static struct evsel_script es_stdout; -static int process_attr(struct perf_tool *tool, union perf_event *event, +static int process_attr(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist) { struct perf_script *scr = container_of(tool, struct perf_script, tool); @@ -2552,7 +2552,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, return 0; } -static int print_event_with_time(struct perf_tool *tool, +static int print_event_with_time(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine, @@ -2588,14 +2588,14 @@ static int print_event_with_time(struct perf_tool *tool, return 0; } -static int print_event(struct perf_tool *tool, union perf_event *event, +static int print_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine, pid_t pid, pid_t tid) { return print_event_with_time(tool, event, sample, machine, pid, tid, 0); } -static int process_comm_event(struct perf_tool *tool, +static int process_comm_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2607,7 +2607,7 @@ static int process_comm_event(struct perf_tool *tool, event->comm.tid); } -static int process_namespaces_event(struct perf_tool *tool, +static int process_namespaces_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2619,7 +2619,7 @@ static int process_namespaces_event(struct perf_tool *tool, event->namespaces.tid); } -static int process_cgroup_event(struct perf_tool *tool, +static int process_cgroup_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2631,7 +2631,7 @@ static int process_cgroup_event(struct perf_tool *tool, sample->tid); } -static int process_fork_event(struct perf_tool *tool, +static int process_fork_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2643,7 +2643,7 @@ static int process_fork_event(struct perf_tool *tool, event->fork.pid, event->fork.tid, event->fork.time); } -static int process_exit_event(struct perf_tool *tool, +static int process_exit_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2656,7 +2656,7 @@ static int process_exit_event(struct perf_tool *tool, return perf_event__process_exit(tool, event, sample, machine); } -static int process_mmap_event(struct perf_tool *tool, +static int process_mmap_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2668,7 +2668,7 @@ static int process_mmap_event(struct perf_tool *tool, event->mmap.tid); } -static int process_mmap2_event(struct perf_tool *tool, +static int process_mmap2_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2680,7 +2680,7 @@ static int process_mmap2_event(struct perf_tool *tool, event->mmap2.tid); } -static int process_switch_event(struct perf_tool *tool, +static int process_switch_event(const 
struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2712,7 +2712,7 @@ static int process_auxtrace_error(struct perf_session *session, } static int -process_lost_event(struct perf_tool *tool, +process_lost_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2722,7 +2722,7 @@ process_lost_event(struct perf_tool *tool, } static int -process_throttle_event(struct perf_tool *tool __maybe_unused, +process_throttle_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2733,7 +2733,7 @@ process_throttle_event(struct perf_tool *tool __maybe_unused, } static int -process_finished_round_event(struct perf_tool *tool __maybe_unused, +process_finished_round_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct ordered_events *oe __maybe_unused) @@ -2743,7 +2743,7 @@ process_finished_round_event(struct perf_tool *tool __maybe_unused, } static int -process_bpf_events(struct perf_tool *tool __maybe_unused, +process_bpf_events(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2755,7 +2755,7 @@ process_bpf_events(struct perf_tool *tool __maybe_unused, sample->tid); } -static int process_text_poke_events(struct perf_tool *tool, +static int process_text_poke_events(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -3757,7 +3757,7 @@ static int process_thread_map_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); if (dump_trace) @@ -3779,7 +3779,7 @@ static int process_cpu_map_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); if (dump_trace) @@ -3809,11 +3809,10 @@ static int process_feature_event(struct perf_session *session, static int perf_script__process_auxtrace_info(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; - int ret = perf_event__process_auxtrace_info(session, event); if (ret == 0) { + const struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); ret = perf_script__setup_per_event_dump(script); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1f92445f7480..d44e708f8a44 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -248,7 +248,7 @@ static void perf_stat__reset_stats(void) perf_stat__reset_shadow_stats(); } -static int process_synthesized_event(struct perf_tool *tool __maybe_unused, +static int process_synthesized_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -2180,7 +2180,7 @@ static int process_stat_config_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); perf_event__read_stat_config(&stat_config, &event->stat_config); @@ -2229,7 +2229,7 @@ static int 
process_thread_map_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); if (st->threads) { @@ -2248,7 +2248,7 @@ static int process_cpu_map_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); struct perf_cpu_map *cpus; diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 19d4542ea18a..5bf818baa662 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -320,7 +320,7 @@ static int *cpus_cstate_state; static u64 *cpus_pstate_start_times; static u64 *cpus_pstate_state; -static int process_comm_event(struct perf_tool *tool, +static int process_comm_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -330,7 +330,7 @@ static int process_comm_event(struct perf_tool *tool, return 0; } -static int process_fork_event(struct perf_tool *tool, +static int process_fork_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -340,7 +340,7 @@ static int process_fork_event(struct perf_tool *tool, return 0; } -static int process_exit_event(struct perf_tool *tool, +static int process_exit_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -571,7 +571,7 @@ typedef int (*tracepoint_handler)(struct timechart *tchart, struct perf_sample *sample, const char *backtrace); -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index d1a06a88d693..1cdac4e190e9 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -740,7 +740,7 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter, return 0; } -static void perf_event__process_sample(struct perf_tool *tool, +static void perf_event__process_sample(const struct perf_tool *tool, const union perf_event *event, struct evsel *evsel, struct perf_sample *sample, diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index ef951ce1a0dd..d6ca541fdc78 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1778,7 +1778,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, return ret; } -static int trace__tool_process(struct perf_tool *tool, +static int trace__tool_process(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -3179,7 +3179,7 @@ static void trace__set_base_time(struct trace *trace, trace->base_time = sample->time; } -static int trace__process_sample(struct perf_tool *tool, +static int trace__process_sample(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index bd8e396f3e57..2f0168b2a5a9 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -11,7 +11,7 @@ struct machine; -static int process_event_mask(struct 
perf_tool *tool __maybe_unused, +static int process_event_mask(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -47,7 +47,7 @@ static int process_event_mask(struct perf_tool *tool __maybe_unused, return 0; } -static int process_event_cpus(struct perf_tool *tool __maybe_unused, +static int process_event_cpus(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -73,7 +73,7 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, return 0; } -static int process_event_range_cpus(struct perf_tool *tool __maybe_unused, +static int process_event_range_cpus(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/tests/dlfilter-test.c b/tools/perf/tests/dlfilter-test.c index da3a9b50b1b1..54f59d1246bc 100644 --- a/tools/perf/tests/dlfilter-test.c +++ b/tools/perf/tests/dlfilter-test.c @@ -62,7 +62,7 @@ static int test_result(const char *msg, int ret) return ret; } -static int process(struct perf_tool *tool, union perf_event *event, +static int process(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) { diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index d01aa931fe81..f85d391ced98 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -37,7 +37,7 @@ #define NO_TAIL_CALL_BARRIER __asm__ __volatile__("" : : : "memory"); #endif -static int mmap_handler(struct perf_tool *tool __maybe_unused, +static int mmap_handler(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index d093a9b878d1..fdecad920f59 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -12,7 +12,7 @@ #include "tests.h" #include "debug.h" -static int process_event_unit(struct perf_tool *tool __maybe_unused, +static int process_event_unit(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -25,7 +25,7 @@ static int process_event_unit(struct perf_tool *tool __maybe_unused, return 0; } -static int process_event_scale(struct perf_tool *tool __maybe_unused, +static int process_event_scale(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -43,7 +43,7 @@ struct event_name { const char *name; }; -static int process_event_name(struct perf_tool *tool, +static int process_event_name(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -57,7 +57,7 @@ static int process_event_name(struct perf_tool *tool, return 0; } -static int process_event_cpus(struct perf_tool *tool __maybe_unused, +static int process_event_cpus(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c index 706780fb5695..6468cc0d0204 100644 --- 
a/tools/perf/tests/stat.c +++ b/tools/perf/tests/stat.c @@ -21,7 +21,7 @@ static bool has_term(struct perf_record_stat_config *config, return false; } -static int process_stat_config_event(struct perf_tool *tool __maybe_unused, +static int process_stat_config_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -62,7 +62,7 @@ static int test__synthesize_stat_config(struct test_suite *test __maybe_unused, return 0; } -static int process_stat_event(struct perf_tool *tool __maybe_unused, +static int process_stat_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -93,7 +93,7 @@ static int test__synthesize_stat(struct test_suite *test __maybe_unused, int sub return 0; } -static int process_stat_round_event(struct perf_tool *tool __maybe_unused, +static int process_stat_round_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index 74308c1368fe..1fe521466bf4 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -60,7 +60,7 @@ static int test__thread_map(struct test_suite *test __maybe_unused, int subtest return 0; } -static int process_event(struct perf_tool *tool __maybe_unused, +static int process_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index fa40f3cb6266..138ffc71b32d 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -899,7 +899,7 @@ static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event, static int arm_spe_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { int err = 0; u64 timestamp; @@ -947,7 +947,7 @@ static int arm_spe_process_event(struct perf_session *session, static int arm_spe_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); @@ -985,7 +985,7 @@ static int arm_spe_process_auxtrace_event(struct perf_session *session, } static int arm_spe_flush(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index e2f317063eec..cbb773ed6f1a 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1240,7 +1240,7 @@ void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int } int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, - struct perf_tool *tool, + const struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process) { @@ -1831,7 +1831,7 @@ int __weak compat_auxtrace_mmap__write_tail(struct auxtrace_mmap *mm, u64 tail) static int __auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, 
process_auxtrace_t fn, + const struct perf_tool *tool, process_auxtrace_t fn, bool snapshot, size_t snapshot_size) { struct auxtrace_mmap *mm = &map->auxtrace_mmap; @@ -1942,14 +1942,14 @@ static int __auxtrace_mmap__read(struct mmap *map, } int auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, process_auxtrace_t fn) + const struct perf_tool *tool, process_auxtrace_t fn) { return __auxtrace_mmap__read(map, itr, tool, fn, false, 0); } int auxtrace_mmap__read_snapshot(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, process_auxtrace_t fn, + const struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size) { return __auxtrace_mmap__read(map, itr, tool, fn, true, snapshot_size); @@ -2829,7 +2829,7 @@ int auxtrace_parse_filters(struct evlist *evlist) } int auxtrace__process_event(struct perf_session *session, union perf_event *event, - struct perf_sample *sample, struct perf_tool *tool) + struct perf_sample *sample, const struct perf_tool *tool) { if (!session->auxtrace) return 0; @@ -2847,7 +2847,7 @@ void auxtrace__dump_auxtrace_sample(struct perf_session *session, session->auxtrace->dump_auxtrace_sample(session, sample); } -int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool) +int auxtrace__flush_events(struct perf_session *session, const struct perf_tool *tool) { if (!session->auxtrace) return 0; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 8a6ec9565835..d405efcd8708 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -208,17 +208,17 @@ struct auxtrace { int (*process_event)(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool); + const struct perf_tool *tool); int (*process_auxtrace_event)(struct perf_session *session, union perf_event *event, - struct perf_tool *tool); + const struct perf_tool *tool); int (*queue_data)(struct perf_session *session, struct perf_sample *sample, union perf_event *event, u64 data_offset); void (*dump_auxtrace_sample)(struct perf_session *session, struct perf_sample *sample); int (*flush_events)(struct perf_session *session, - struct perf_tool *tool); + const struct perf_tool *tool); void (*free_events)(struct perf_session *session); void (*free)(struct perf_session *session); bool (*evsel_is_auxtrace)(struct perf_session *session, @@ -508,17 +508,17 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, struct evlist *evlist, struct evsel *evsel, int idx); -typedef int (*process_auxtrace_t)(struct perf_tool *tool, +typedef int (*process_auxtrace_t)(const struct perf_tool *tool, struct mmap *map, union perf_event *event, void *data1, size_t len1, void *data2, size_t len2); int auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, process_auxtrace_t fn); + const struct perf_tool *tool, process_auxtrace_t fn); int auxtrace_mmap__read_snapshot(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, process_auxtrace_t fn, + const struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size); int auxtrace_queues__init_nr(struct auxtrace_queues *queues, int nr_queues); @@ -639,10 +639,10 @@ int addr_filters__parse_bare_filter(struct addr_filters *filts, int auxtrace_parse_filters(struct evlist *evlist); int auxtrace__process_event(struct perf_session *session, union perf_event *event, - struct perf_sample *sample, struct perf_tool *tool); + struct perf_sample *sample, const 
struct perf_tool *tool); void auxtrace__dump_auxtrace_sample(struct perf_session *session, struct perf_sample *sample); -int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool); +int auxtrace__flush_events(struct perf_session *session, const struct perf_tool *tool); void auxtrace__free_events(struct perf_session *session); void auxtrace__free(struct perf_session *session); bool auxtrace__evsel_is_auxtrace(struct perf_session *session, @@ -809,7 +809,7 @@ static inline int auxtrace__process_event(struct perf_session *session __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } @@ -822,7 +822,7 @@ void auxtrace__dump_auxtrace_sample(struct perf_session *session __maybe_unused, static inline int auxtrace__flush_events(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 827695cd0408..13608237c50e 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -170,7 +170,7 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, { struct perf_record_ksymbol *ksymbol_event = &event->ksymbol; struct perf_record_bpf_event *bpf_event = &event->bpf; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct bpf_prog_info_node *info_node; struct perf_bpil *info_linear; struct bpf_prog_info *info; @@ -310,7 +310,7 @@ struct kallsyms_parse { union perf_event *event; perf_event__handler_t process; struct machine *machine; - struct perf_tool *tool; + const struct perf_tool *tool; }; static int diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 83a1581e8cf1..098fcc625d91 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -42,7 +42,7 @@ static bool no_buildid_cache; -int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused, +int build_id__mark_dso_hit(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct evsel *evsel __maybe_unused, @@ -67,7 +67,7 @@ int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused, return 0; } -static int perf_event__exit_del_thread(struct perf_tool *tool __maybe_unused, +static int perf_event__exit_del_thread(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index 3fa8bffb07ca..ae87c4c58d5b 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -35,11 +35,11 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size, char *__dso__build_id_filename(const struct dso *dso, char *bf, size_t size, bool is_debug, bool is_kallsyms); -int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event, +int build_id__mark_dso_hit(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); -int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, +int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 
566498ebc694..34552940ec1d 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -788,7 +788,7 @@ static void cs_etm__dump_event(struct cs_etm_queue *etmq, } static int cs_etm__flush_events(struct perf_session *session, - struct perf_tool *tool) + const struct perf_tool *tool) { struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, @@ -2715,7 +2715,7 @@ static int cs_etm__process_switch_cpu_wide(struct cs_etm_auxtrace *etm, static int cs_etm__process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, @@ -2785,7 +2785,7 @@ static void dump_queued_data(struct cs_etm_auxtrace *etm, static int cs_etm__process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 2b732bccabad..9e2170604b66 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -792,7 +792,7 @@ static bool is_flush_needed(struct ctf_stream *cs) return cs->count >= STREAM_FLUSH_COUNT; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *_event, struct perf_sample *sample, struct evsel *evsel, @@ -871,7 +871,7 @@ do { \ } while(0) #define __FUNC_PROCESS_NON_SAMPLE(_name, body) \ -static int process_##_name##_event(struct perf_tool *tool, \ +static int process_##_name##_event(const struct perf_tool *tool, \ union perf_event *_event, \ struct perf_sample *sample, \ struct machine *machine) \ diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 3cf64f5b23ee..905ea9823f9d 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -118,7 +118,7 @@ static void output_json_key_format(FILE *out, bool comma, int depth, va_end(args); } -static void output_sample_callchain_entry(struct perf_tool *tool, +static void output_sample_callchain_entry(const struct perf_tool *tool, u64 ip, struct addr_location *al) { struct convert_json *c = container_of(tool, struct convert_json, tool); @@ -146,7 +146,7 @@ static void output_sample_callchain_entry(struct perf_tool *tool, output_json_format(out, false, 4, "}"); } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel __maybe_unused, diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index f32f9abf6344..c2f0e7f40ad5 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -216,7 +216,7 @@ size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp) event->cgroup.id, event->cgroup.path); } -int perf_event__process_comm(struct perf_tool *tool __maybe_unused, +int perf_event__process_comm(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -224,7 +224,7 @@ int perf_event__process_comm(struct perf_tool *tool __maybe_unused, return machine__process_comm_event(machine, event, sample); } -int perf_event__process_namespaces(struct perf_tool *tool 
__maybe_unused, +int perf_event__process_namespaces(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -232,7 +232,7 @@ int perf_event__process_namespaces(struct perf_tool *tool __maybe_unused, return machine__process_namespaces_event(machine, event, sample); } -int perf_event__process_cgroup(struct perf_tool *tool __maybe_unused, +int perf_event__process_cgroup(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -240,7 +240,7 @@ int perf_event__process_cgroup(struct perf_tool *tool __maybe_unused, return machine__process_cgroup_event(machine, event, sample); } -int perf_event__process_lost(struct perf_tool *tool __maybe_unused, +int perf_event__process_lost(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -248,7 +248,7 @@ int perf_event__process_lost(struct perf_tool *tool __maybe_unused, return machine__process_lost_event(machine, event, sample); } -int perf_event__process_aux(struct perf_tool *tool __maybe_unused, +int perf_event__process_aux(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -256,7 +256,7 @@ int perf_event__process_aux(struct perf_tool *tool __maybe_unused, return machine__process_aux_event(machine, event); } -int perf_event__process_itrace_start(struct perf_tool *tool __maybe_unused, +int perf_event__process_itrace_start(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -264,7 +264,7 @@ int perf_event__process_itrace_start(struct perf_tool *tool __maybe_unused, return machine__process_itrace_start_event(machine, event); } -int perf_event__process_aux_output_hw_id(struct perf_tool *tool __maybe_unused, +int perf_event__process_aux_output_hw_id(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -272,7 +272,7 @@ int perf_event__process_aux_output_hw_id(struct perf_tool *tool __maybe_unused, return machine__process_aux_output_hw_id_event(machine, event); } -int perf_event__process_lost_samples(struct perf_tool *tool __maybe_unused, +int perf_event__process_lost_samples(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -280,7 +280,7 @@ int perf_event__process_lost_samples(struct perf_tool *tool __maybe_unused, return machine__process_lost_samples_event(machine, event, sample); } -int perf_event__process_switch(struct perf_tool *tool __maybe_unused, +int perf_event__process_switch(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -288,7 +288,7 @@ int perf_event__process_switch(struct perf_tool *tool __maybe_unused, return machine__process_switch_event(machine, event); } -int perf_event__process_ksymbol(struct perf_tool *tool __maybe_unused, +int perf_event__process_ksymbol(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -296,7 +296,7 @@ int perf_event__process_ksymbol(struct perf_tool *tool __maybe_unused, return machine__process_ksymbol(machine, event, sample); } -int perf_event__process_bpf(struct perf_tool *tool __maybe_unused, 
+int perf_event__process_bpf(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -304,7 +304,7 @@ int perf_event__process_bpf(struct perf_tool *tool __maybe_unused, return machine__process_bpf(machine, event, sample); } -int perf_event__process_text_poke(struct perf_tool *tool __maybe_unused, +int perf_event__process_text_poke(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -387,7 +387,7 @@ size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp) return ret; } -int perf_event__process_mmap(struct perf_tool *tool __maybe_unused, +int perf_event__process_mmap(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -395,7 +395,7 @@ int perf_event__process_mmap(struct perf_tool *tool __maybe_unused, return machine__process_mmap_event(machine, event, sample); } -int perf_event__process_mmap2(struct perf_tool *tool __maybe_unused, +int perf_event__process_mmap2(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -410,7 +410,7 @@ size_t perf_event__fprintf_task(union perf_event *event, FILE *fp) event->fork.ppid, event->fork.ptid); } -int perf_event__process_fork(struct perf_tool *tool __maybe_unused, +int perf_event__process_fork(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -418,7 +418,7 @@ int perf_event__process_fork(struct perf_tool *tool __maybe_unused, return machine__process_fork_event(machine, event, sample); } -int perf_event__process_exit(struct perf_tool *tool __maybe_unused, +int perf_event__process_exit(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -587,7 +587,7 @@ size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FIL return ret; } -int perf_event__process(struct perf_tool *tool __maybe_unused, +int perf_event__process(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index d8bcee2e9b93..4b24f1c580fd 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -267,71 +267,71 @@ struct perf_tool; void perf_event__read_stat_config(struct perf_stat_config *config, struct perf_record_stat_config *event); -int perf_event__process_comm(struct perf_tool *tool, +int perf_event__process_comm(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_lost(struct perf_tool *tool, +int perf_event__process_lost(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_lost_samples(struct perf_tool *tool, +int perf_event__process_lost_samples(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_aux(struct perf_tool *tool, +int perf_event__process_aux(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_itrace_start(struct perf_tool *tool, +int perf_event__process_itrace_start(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, 
struct machine *machine); -int perf_event__process_aux_output_hw_id(struct perf_tool *tool, +int perf_event__process_aux_output_hw_id(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_switch(struct perf_tool *tool, +int perf_event__process_switch(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_namespaces(struct perf_tool *tool, +int perf_event__process_namespaces(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_cgroup(struct perf_tool *tool, +int perf_event__process_cgroup(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_mmap(struct perf_tool *tool, +int perf_event__process_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_mmap2(struct perf_tool *tool, +int perf_event__process_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_fork(struct perf_tool *tool, +int perf_event__process_fork(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_exit(struct perf_tool *tool, +int perf_event__process_exit(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_ksymbol(struct perf_tool *tool, +int perf_event__process_ksymbol(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_bpf(struct perf_tool *tool, +int perf_event__process_bpf(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_text_poke(struct perf_tool *tool, +int perf_event__process_text_poke(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process(struct perf_tool *tool, +int perf_event__process(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 55e9553861d0..3309fe7f1d12 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4326,7 +4326,7 @@ out_delete_evlist: int perf_event__process_feature(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct feat_fd ff = { .fd = 0 }; struct perf_record_header_feature *fe = (struct perf_record_header_feature *)event; int type = fe->header.type; @@ -4405,7 +4405,7 @@ size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) return ret; } -int perf_event__process_attr(struct perf_tool *tool __maybe_unused, +int perf_event__process_attr(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist) { @@ -4444,7 +4444,7 @@ int perf_event__process_attr(struct perf_tool *tool __maybe_unused, return 0; } -int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, +int perf_event__process_event_update(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist) { diff 
--git a/tools/perf/util/header.h b/tools/perf/util/header.h index 7c16a250e738..07ff647197ff 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -156,9 +156,9 @@ int perf_header__fprintf_info(struct perf_session *s, FILE *fp, bool full); int perf_event__process_feature(struct perf_session *session, union perf_event *event); -int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, +int perf_event__process_attr(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist); -int perf_event__process_event_update(struct perf_tool *tool, +int perf_event__process_event_update(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist); size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp); diff --git a/tools/perf/util/hisi-ptt.c b/tools/perf/util/hisi-ptt.c index 37ea987017f6..e4cc4785f744 100644 --- a/tools/perf/util/hisi-ptt.c +++ b/tools/perf/util/hisi-ptt.c @@ -79,14 +79,14 @@ static void hisi_ptt_dump_event(struct hisi_ptt *ptt, unsigned char *buf, static int hisi_ptt_process_event(struct perf_session *session __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } static int hisi_ptt_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct hisi_ptt *ptt = container_of(session->auxtrace, struct hisi_ptt, auxtrace); @@ -123,7 +123,7 @@ static int hisi_ptt_process_auxtrace_event(struct perf_session *session, } static int hisi_ptt_flush(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 160d346cb450..27d9b5c9fec8 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -591,7 +591,7 @@ static int intel_bts_process_queues(struct intel_bts *bts, u64 timestamp) static int intel_bts_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, auxtrace); @@ -634,7 +634,7 @@ static int intel_bts_process_event(struct perf_session *session, static int intel_bts_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, auxtrace); @@ -675,7 +675,7 @@ static int intel_bts_process_auxtrace_event(struct perf_session *session, } static int intel_bts_flush(struct perf_session *session, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, auxtrace); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 1fc03bfd14c3..fd2597613f3d 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -3449,7 +3449,7 @@ out: static int intel_pt_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, auxtrace); @@ 
-3533,7 +3533,7 @@ static int intel_pt_process_event(struct perf_session *session, return err; } -static int intel_pt_flush(struct perf_session *session, struct perf_tool *tool) +static int intel_pt_flush(struct perf_session *session, const struct perf_tool *tool) { struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, auxtrace); @@ -3600,7 +3600,7 @@ static bool intel_pt_evsel_is_auxtrace(struct perf_session *session, static int intel_pt_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, auxtrace); diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 1f657ef8975f..5ce13653512b 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -424,7 +424,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) { struct perf_sample sample; union perf_event *event; - struct perf_tool *tool = jd->session->tool; + const struct perf_tool *tool = jd->session->tool; uint64_t code, addr; uintptr_t uaddr; char *filename; @@ -543,7 +543,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) { struct perf_sample sample; union perf_event *event; - struct perf_tool *tool = jd->session->tool; + const struct perf_tool *tool = jd->session->tool; char *filename; size_t size; struct stat st; diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 4ec583e511af..73846b73d0cf 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -912,7 +912,7 @@ static int s390_cpumsf_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { struct s390_cpumsf *sf = container_of(session->auxtrace, struct s390_cpumsf, @@ -955,7 +955,7 @@ s390_cpumsf_process_event(struct perf_session *session, static int s390_cpumsf_process_auxtrace_event(struct perf_session *session, union perf_event *event __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct s390_cpumsf *sf = container_of(session->auxtrace, struct s390_cpumsf, @@ -998,7 +998,7 @@ static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused) } static int s390_cpumsf_flush(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index aa02c1d7b1f9..41beebd0913c 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -103,7 +103,7 @@ static int perf_session__process_compressed_event(struct perf_session *session, static int perf_session__deliver_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool, + const struct perf_tool *tool, u64 file_offset, const char *file_path); @@ -330,7 +330,7 @@ static int process_event_synth_tracing_data_stub(struct perf_session *session return 0; } -static int process_event_synth_attr_stub(struct perf_tool *tool __maybe_unused, +static int process_event_synth_attr_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct evlist **pevlist __maybe_unused) @@ -339,7 +339,7 @@ static int process_event_synth_attr_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int 
process_event_synth_event_update_stub(struct perf_tool *tool __maybe_unused, +static int process_event_synth_event_update_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct evlist **pevlist __maybe_unused) @@ -351,7 +351,7 @@ static int process_event_synth_event_update_stub(struct perf_tool *tool __maybe_ return 0; } -static int process_event_sample_stub(struct perf_tool *tool __maybe_unused, +static int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct evsel *evsel __maybe_unused, @@ -361,7 +361,7 @@ static int process_event_sample_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int process_event_stub(struct perf_tool *tool __maybe_unused, +static int process_event_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -370,7 +370,7 @@ static int process_event_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int process_finished_round_stub(struct perf_tool *tool __maybe_unused, +static int process_finished_round_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe __maybe_unused) { @@ -1078,7 +1078,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { * Flush every events below timestamp 7 * etc... */ -int perf_event__process_finished_round(struct perf_tool *tool __maybe_unused, +int perf_event__process_finished_round(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe) { @@ -1472,7 +1472,7 @@ static struct machine *machines__find_for_cpumode(struct machines *machines, } static int deliver_sample_value(struct evlist *evlist, - struct perf_tool *tool, + const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct sample_read_value *v, @@ -1504,7 +1504,7 @@ static int deliver_sample_value(struct evlist *evlist, } static int deliver_sample_group(struct evlist *evlist, - struct perf_tool *tool, + const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine, @@ -1526,7 +1526,7 @@ static int deliver_sample_group(struct evlist *evlist, return ret; } -static int evlist__deliver_sample(struct evlist *evlist, struct perf_tool *tool, +static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine) { @@ -1551,7 +1551,7 @@ static int machines__deliver_event(struct machines *machines, struct evlist *evlist, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool, u64 file_offset, + const struct perf_tool *tool, u64 file_offset, const char *file_path) { struct evsel *evsel; @@ -1639,7 +1639,7 @@ static int machines__deliver_event(struct machines *machines, static int perf_session__deliver_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool, + const struct perf_tool *tool, u64 file_offset, const char *file_path) { @@ -1672,7 +1672,7 @@ static s64 perf_session__process_user_event(struct perf_session *session, const char *file_path) { struct ordered_events *oe = &session->ordered_events; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_sample sample = { .time = 0, }; int fd = 
perf_data__fd(session->data); int err; @@ -1759,7 +1759,7 @@ int perf_session__deliver_synth_event(struct perf_session *session, struct perf_sample *sample) { struct evlist *evlist = session->evlist; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; events_stats__inc(&evlist->stats, event->header.type); @@ -1891,7 +1891,7 @@ static s64 perf_session__process_event(struct perf_session *session, const char *file_path) { struct evlist *evlist = session->evlist; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; int ret; if (session->header.needs_swap) diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index cc28976bb00e..6886cc85600f 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -163,7 +163,7 @@ int perf_session__dsos_hit_all(struct perf_session *session); int perf_event__process_id_index(struct perf_session *session, union perf_event *event); -int perf_event__process_finished_round(struct perf_tool *tool, +int perf_event__process_finished_round(const struct perf_tool *tool, union perf_event *event, struct ordered_events *oe); diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 5498048f56ea..7f884d70de81 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: GPL-2.0-only #include "util/cgroup.h" #include "util/data.h" @@ -47,7 +47,7 @@ unsigned int proc_map_timeout = DEFAULT_PROC_MAP_PARSE_TIMEOUT; -int perf_tool__process_synth_event(struct perf_tool *tool, +int perf_tool__process_synth_event(const struct perf_tool *tool, union perf_event *event, struct machine *machine, perf_event__handler_t process) @@ -187,7 +187,7 @@ static int perf_event__prepare_comm(union perf_event *event, pid_t pid, pid_t ti return 0; } -pid_t perf_event__synthesize_comm(struct perf_tool *tool, +pid_t perf_event__synthesize_comm(const struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine) @@ -218,7 +218,7 @@ static void perf_event__get_ns_link_info(pid_t pid, const char *ns, } } -int perf_event__synthesize_namespaces(struct perf_tool *tool, +int perf_event__synthesize_namespaces(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, @@ -257,7 +257,7 @@ int perf_event__synthesize_namespaces(struct perf_tool *tool, return 0; } -static int perf_event__synthesize_fork(struct perf_tool *tool, +static int perf_event__synthesize_fork(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, pid_t ppid, perf_event__handler_t process, @@ -418,7 +418,7 @@ out: dso__put(dso); } -int perf_event__synthesize_mmap_events(struct perf_tool *tool, +int perf_event__synthesize_mmap_events(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, @@ -542,7 +542,7 @@ out: } #ifdef HAVE_FILE_HANDLE -static int perf_event__synthesize_cgroup(struct perf_tool *tool, +static int perf_event__synthesize_cgroup(const struct perf_tool *tool, union perf_event *event, char *path, size_t mount_len, perf_event__handler_t process, @@ -582,7 +582,7 @@ static int perf_event__synthesize_cgroup(struct perf_tool *tool, return 0; } -static int perf_event__walk_cgroup_tree(struct perf_tool *tool, +static int perf_event__walk_cgroup_tree(const struct perf_tool *tool, union perf_event *event, char 
*path, size_t mount_len, perf_event__handler_t process, @@ -630,7 +630,7 @@ static int perf_event__walk_cgroup_tree(struct perf_tool *tool, return ret; } -int perf_event__synthesize_cgroups(struct perf_tool *tool, +int perf_event__synthesize_cgroups(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { @@ -657,7 +657,7 @@ int perf_event__synthesize_cgroups(struct perf_tool *tool, return 0; } #else -int perf_event__synthesize_cgroups(struct perf_tool *tool __maybe_unused, +int perf_event__synthesize_cgroups(const struct perf_tool *tool __maybe_unused, perf_event__handler_t process __maybe_unused, struct machine *machine __maybe_unused) { @@ -666,7 +666,7 @@ int perf_event__synthesize_cgroups(struct perf_tool *tool __maybe_unused, #endif struct perf_event__synthesize_modules_maps_cb_args { - struct perf_tool *tool; + const struct perf_tool *tool; perf_event__handler_t process; struct machine *machine; union perf_event *event; @@ -717,7 +717,7 @@ static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) return 0; } -int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, +int perf_event__synthesize_modules(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { int rc; @@ -763,7 +763,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, union perf_event *fork_event, union perf_event *namespaces_event, pid_t pid, int full, perf_event__handler_t process, - struct perf_tool *tool, struct machine *machine, + const struct perf_tool *tool, struct machine *machine, bool needs_mmap, bool mmap_data) { char filename[PATH_MAX]; @@ -852,7 +852,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, return rc; } -int perf_event__synthesize_thread_map(struct perf_tool *tool, +int perf_event__synthesize_thread_map(const struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, @@ -929,7 +929,7 @@ out: return err; } -static int __perf_event__synthesize_threads(struct perf_tool *tool, +static int __perf_event__synthesize_threads(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, @@ -993,7 +993,7 @@ out: } struct synthesize_threads_arg { - struct perf_tool *tool; + const struct perf_tool *tool; perf_event__handler_t process; struct machine *machine; bool needs_mmap; @@ -1015,7 +1015,7 @@ static void *synthesize_threads_worker(void *arg) return NULL; } -int perf_event__synthesize_threads(struct perf_tool *tool, +int perf_event__synthesize_threads(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data, @@ -1104,14 +1104,14 @@ free_dirent: return err; } -int __weak perf_event__synthesize_extra_kmaps(struct perf_tool *tool __maybe_unused, +int __weak perf_event__synthesize_extra_kmaps(const struct perf_tool *tool __maybe_unused, perf_event__handler_t process __maybe_unused, struct machine *machine __maybe_unused) { return 0; } -static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool, +static int __perf_event__synthesize_kernel_mmap(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { @@ -1183,7 +1183,7 @@ static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool, return err; } -int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, +int perf_event__synthesize_kernel_mmap(const struct perf_tool *tool, perf_event__handler_t 
process, struct machine *machine) { @@ -1196,7 +1196,7 @@ int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, return perf_event__synthesize_extra_kmaps(tool, process, machine); } -int perf_event__synthesize_thread_map2(struct perf_tool *tool, +int perf_event__synthesize_thread_map2(const struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine) @@ -1346,7 +1346,7 @@ static struct perf_record_cpu_map *cpu_map_event__new(const struct perf_cpu_map } -int perf_event__synthesize_cpu_map(struct perf_tool *tool, +int perf_event__synthesize_cpu_map(const struct perf_tool *tool, const struct perf_cpu_map *map, perf_event__handler_t process, struct machine *machine) @@ -1364,7 +1364,7 @@ int perf_event__synthesize_cpu_map(struct perf_tool *tool, return err; } -int perf_event__synthesize_stat_config(struct perf_tool *tool, +int perf_event__synthesize_stat_config(const struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine) @@ -1403,7 +1403,7 @@ int perf_event__synthesize_stat_config(struct perf_tool *tool, return err; } -int perf_event__synthesize_stat(struct perf_tool *tool, +int perf_event__synthesize_stat(const struct perf_tool *tool, struct perf_cpu cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, @@ -1425,7 +1425,7 @@ int perf_event__synthesize_stat(struct perf_tool *tool, return process(tool, (union perf_event *) &event, NULL, machine); } -int perf_event__synthesize_stat_round(struct perf_tool *tool, +int perf_event__synthesize_stat_round(const struct perf_tool *tool, u64 evtime, u64 type, perf_event__handler_t process, struct machine *machine) @@ -1826,7 +1826,7 @@ int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_s return (void *)array - (void *)start; } -int __perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, +int __perf_event__synthesize_id_index(const struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine, size_t from) { union perf_event *ev; @@ -1918,13 +1918,13 @@ out_err: return err; } -int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, +int perf_event__synthesize_id_index(const struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine) { return __perf_event__synthesize_id_index(tool, process, evlist, machine, 0); } -int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, +int __machine__synthesize_threads(struct machine *machine, const struct perf_tool *tool, struct target *target, struct perf_thread_map *threads, perf_event__handler_t process, bool needs_mmap, bool data_mmap, unsigned int nr_threads_synthesize) @@ -1985,7 +1985,7 @@ static struct perf_record_event_update *event_update_event__new(size_t size, u64 return ev; } -int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evsel *evsel, +int perf_event__synthesize_event_update_unit(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { size_t size = strlen(evsel->unit); @@ -2002,7 +2002,7 @@ int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evse return err; } -int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel, +int perf_event__synthesize_event_update_scale(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t 
process) { struct perf_record_event_update *ev; @@ -2019,7 +2019,7 @@ int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evs return err; } -int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel, +int perf_event__synthesize_event_update_name(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { struct perf_record_event_update *ev; @@ -2036,7 +2036,7 @@ int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evse return err; } -int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, +int perf_event__synthesize_event_update_cpus(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { struct synthesize_cpu_map_data syn_data = { .map = evsel->core.own_cpus }; @@ -2059,7 +2059,7 @@ int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evse return err; } -int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, +int perf_event__synthesize_attrs(const struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process) { struct evsel *evsel; @@ -2087,7 +2087,7 @@ static bool has_scale(struct evsel *evsel) return evsel->scale != 1; } -int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evsel_list, +int perf_event__synthesize_extra_attr(const struct perf_tool *tool, struct evlist *evsel_list, perf_event__handler_t process, bool is_pipe) { struct evsel *evsel; @@ -2143,7 +2143,7 @@ int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evs return 0; } -int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, +int perf_event__synthesize_attr(const struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process) { union perf_event *ev; @@ -2177,7 +2177,7 @@ int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr * } #ifdef HAVE_LIBTRACEEVENT -int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, +int perf_event__synthesize_tracing_data(const struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process) { union perf_event ev; @@ -2225,7 +2225,7 @@ int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct e } #endif -int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, +int perf_event__synthesize_build_id(const struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine) { union perf_event ev; @@ -2249,7 +2249,7 @@ int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 return process(tool, &ev, NULL, machine); } -int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool, +int perf_event__synthesize_stat_events(struct perf_stat_config *config, const struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs) { int err; @@ -2286,7 +2286,7 @@ int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct p extern const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE]; -int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, +int perf_event__synthesize_features(const struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process) { struct perf_header *header = &session->header; @@ -2349,7 
+2349,7 @@ int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session return ret; } -int perf_event__synthesize_for_pipe(struct perf_tool *tool, +int perf_event__synthesize_for_pipe(const struct perf_tool *tool, struct perf_session *session, struct perf_data *data, perf_event__handler_t process) diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h index 53737d1619a4..31df7653677f 100644 --- a/tools/perf/util/synthetic-events.h +++ b/tools/perf/util/synthetic-events.h @@ -40,45 +40,45 @@ enum perf_record_synth { int parse_synth_opt(char *str); -typedef int (*perf_event__handler_t)(struct perf_tool *tool, union perf_event *event, +typedef int (*perf_event__handler_t)(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process); -int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process); -int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_cpu_map(struct perf_tool *tool, const struct perf_cpu_map *cpus, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evsel_list, perf_event__handler_t process, bool is_pipe); -int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process); -int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine); -int __perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine, size_t from); +int perf_event__synthesize_attrs(const struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synthesize_attr(const struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process); +int perf_event__synthesize_build_id(const struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_cpu_map(const struct perf_tool *tool, const struct perf_cpu_map *cpus, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_event_update_cpus(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_name(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_scale(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int 
perf_event__synthesize_event_update_unit(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_extra_attr(const struct perf_tool *tool, struct evlist *evsel_list, perf_event__handler_t process, bool is_pipe); +int perf_event__synthesize_extra_kmaps(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_features(const struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synthesize_id_index(const struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine); +int __perf_event__synthesize_id_index(const struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine, size_t from); int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_sample *sample); -int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data); -int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_namespaces(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_cgroups(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_kernel_mmap(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_mmap_events(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data); +int perf_event__synthesize_modules(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_namespaces(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_cgroups(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_format, const struct perf_sample *sample); -int perf_event__synthesize_stat_config(struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs); -int perf_event__synthesize_stat_round(struct perf_tool *tool, u64 time, u64 type, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_stat(struct perf_tool *tool, struct perf_cpu cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_thread_map2(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_thread_map(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data); -int perf_event__synthesize_threads(struct 
perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data, unsigned int nr_threads_synthesize); -int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process); -int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -pid_t perf_event__synthesize_comm(struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine); - -int perf_tool__process_synth_event(struct perf_tool *tool, union perf_event *event, struct machine *machine, perf_event__handler_t process); +int perf_event__synthesize_stat_config(const struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_stat_events(struct perf_stat_config *config, const struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs); +int perf_event__synthesize_stat_round(const struct perf_tool *tool, u64 time, u64 type, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_stat(const struct perf_tool *tool, struct perf_cpu cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_thread_map2(const struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_thread_map(const struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data); +int perf_event__synthesize_threads(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data, unsigned int nr_threads_synthesize); +int perf_event__synthesize_tracing_data(const struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +pid_t perf_event__synthesize_comm(const struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine); + +int perf_tool__process_synth_event(const struct perf_tool *tool, union perf_event *event, struct machine *machine, perf_event__handler_t process); size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, u64 read_format); -int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, +int __machine__synthesize_threads(struct machine *machine, const struct perf_tool *tool, struct target *target, struct perf_thread_map *threads, perf_event__handler_t process, bool needs_mmap, bool data_mmap, unsigned int nr_threads_synthesize); @@ -87,7 +87,7 @@ int machine__synthesize_threads(struct machine *machine, struct target *target, unsigned int nr_threads_synthesize); #ifdef HAVE_AUXTRACE_SUPPORT -int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct perf_tool *tool, +int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, const struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process); #else // HAVE_AUXTRACE_SUPPORT @@ -96,7 +96,7 @@ int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct per static inline int 
perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused, - struct perf_tool *tool __maybe_unused, + const struct perf_tool *tool __maybe_unused, struct perf_session *session __maybe_unused, perf_event__handler_t process __maybe_unused) { @@ -117,7 +117,7 @@ static inline int perf_event__synthesize_bpf_events(struct perf_session *session } #endif // HAVE_LIBBPF_SUPPORT -int perf_event__synthesize_for_pipe(struct perf_tool *tool, +int perf_event__synthesize_for_pipe(const struct perf_tool *tool, struct perf_session *session, struct perf_data *data, perf_event__handler_t process); diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 62bbc9cec151..2b12c674768f 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -15,14 +15,14 @@ struct perf_tool; struct machine; struct ordered_events; -typedef int (*event_sample)(struct perf_tool *tool, union perf_event *event, +typedef int (*event_sample)(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); -typedef int (*event_op)(struct perf_tool *tool, union perf_event *event, +typedef int (*event_op)(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -typedef int (*event_attr_op)(struct perf_tool *tool, +typedef int (*event_attr_op)(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist); @@ -31,7 +31,7 @@ typedef s64 (*event_op3)(struct perf_session *session, union perf_event *event); typedef int (*event_op4)(struct perf_session *session, union perf_event *event, u64 data, const char *str); -typedef int (*event_oe)(struct perf_tool *tool, union perf_event *event, +typedef int (*event_oe)(const struct perf_tool *tool, union perf_event *event, struct ordered_events *oe); enum show_feature_header { diff --git a/tools/perf/util/tsc.c b/tools/perf/util/tsc.c index f19791d46e99..2e33a20e1e1b 100644 --- a/tools/perf/util/tsc.c +++ b/tools/perf/util/tsc.c @@ -72,7 +72,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, } int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, - struct perf_tool *tool, + const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { -- cgit v1.2.3 From 564e5cbcfdf5c55491461edc61f6b7e2a4418063 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:56 -0700 Subject: perf tool: Move fill defaults into tool.c The aim here is to eventually make perf_tool__fill_defaults() an init function so that the tool struct can be more const. Create a tool.c to go along with tool.h. Move perf_tool__fill_defaults() out of session.c into tool.c along with the default stub values. Add perf_tool__compressed_is_stub() for the check in perf_session__process_user_event(). perf_session__process_compressed_event() is only ever used as a default initialization value, so migrate it into tool.c as well.
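A minimal sketch of what the new helper could look like, assuming the compressed-event stub stays file-local to tool.c (its actual body falls outside this excerpt; the session.c hunk below shows the call site):

	/*
	 * Sketch only, not part of the patch: reports whether the tool's
	 * ->compressed handler is still the default stub, i.e. no real
	 * decompression handler was installed.
	 */
	bool perf_tool__compressed_is_stub(const struct perf_tool *tool)
	{
		return tool->compressed == perf_session__process_compressed_event_stub;
	}

With the stub private to tool.c, perf_session__process_user_event() can call this helper instead of comparing against the stub directly.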
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/session.c | 310 +------------------------------------------ tools/perf/util/session.h | 2 - tools/perf/util/tool.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/tool.h | 4 + 5 files changed, 331 insertions(+), 311 deletions(-) create mode 100644 tools/perf/util/tool.c (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index b24360c04aae..f6f18b4228bb 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -66,6 +66,7 @@ perf-util-y += map.o perf-util-y += maps.o perf-util-y += pstack.o perf-util-y += session.o +perf-util-y += tool.o perf-util-y += sample-raw.o perf-util-y += s390-sample-raw.o perf-util-y += amd-sample-raw.o diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 41beebd0913c..5c0e0341d39e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -39,68 +39,6 @@ #include "annotate.h" #include -#ifdef HAVE_ZSTD_SUPPORT -static int perf_session__process_compressed_event(struct perf_session *session, - union perf_event *event, u64 file_offset, - const char *file_path) -{ - void *src; - size_t decomp_size, src_size; - u64 decomp_last_rem = 0; - size_t mmap_len, decomp_len = session->header.env.comp_mmap_len; - struct decomp *decomp, *decomp_last = session->active_decomp->decomp_last; - - if (decomp_last) { - decomp_last_rem = decomp_last->size - decomp_last->head; - decomp_len += decomp_last_rem; - } - - mmap_len = sizeof(struct decomp) + decomp_len; - decomp = mmap(NULL, mmap_len, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - if (decomp == MAP_FAILED) { - pr_err("Couldn't allocate memory for decompression\n"); - return -1; - } - - decomp->file_pos = file_offset; - decomp->file_path = file_path; - decomp->mmap_len = mmap_len; - decomp->head = 0; - - if (decomp_last_rem) { - memcpy(decomp->data, &(decomp_last->data[decomp_last->head]), decomp_last_rem); - decomp->size = decomp_last_rem; - } - - src = (void *)event + sizeof(struct perf_record_compressed); - src_size = event->pack.header.size - sizeof(struct perf_record_compressed); - - decomp_size = zstd_decompress_stream(session->active_decomp->zstd_decomp, src, src_size, - &(decomp->data[decomp_last_rem]), decomp_len - decomp_last_rem); - if (!decomp_size) { - munmap(decomp, mmap_len); - pr_err("Couldn't decompress data\n"); - return -1; - } - - decomp->size += decomp_size; - - if (session->active_decomp->decomp == NULL) - session->active_decomp->decomp = decomp; - else - session->active_decomp->decomp_last->next = decomp; - - session->active_decomp->decomp_last = decomp; - - pr_debug("decomp (B): %zd to %zd\n", src_size, decomp_size); - - return 0; -} -#else /* !HAVE_ZSTD_SUPPORT */ -#define perf_session__process_compressed_event perf_session__process_compressed_event_stub -#endif - static int perf_session__deliver_event(struct perf_session *session, union 
perf_event *event, const struct perf_tool *tool, @@ -321,251 +259,6 @@ void perf_session__delete(struct perf_session *session) free(session); } -static int process_event_synth_tracing_data_stub(struct perf_session *session - __maybe_unused, - union perf_event *event - __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_synth_attr_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct evlist **pevlist - __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_synth_event_update_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct evlist **pevlist - __maybe_unused) -{ - if (dump_trace) - perf_event__fprintf_event_update(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_sample *sample __maybe_unused, - struct evsel *evsel __maybe_unused, - struct machine *machine __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_finished_round_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct ordered_events *oe __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int skipn(int fd, off_t n) -{ - char buf[4096]; - ssize_t ret; - - while (n > 0) { - ret = read(fd, buf, min(n, (off_t)sizeof(buf))); - if (ret <= 0) - return ret; - n -= ret; - } - - return 0; -} - -static s64 process_event_auxtrace_stub(struct perf_session *session __maybe_unused, - union perf_event *event) -{ - dump_printf(": unhandled!\n"); - if (perf_data__is_pipe(session->data)) - skipn(perf_data__fd(session->data), event->auxtrace.size); - return event->auxtrace.size; -} - -static int process_event_op2_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - - -static -int process_event_thread_map_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused) -{ - if (dump_trace) - perf_event__fprintf_thread_map(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static -int process_event_cpu_map_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused) -{ - if (dump_trace) - perf_event__fprintf_cpu_map(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static -int process_event_stat_config_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused) -{ - if (dump_trace) - perf_event__fprintf_stat_config(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_stat_stub(struct perf_session *perf_session __maybe_unused, - union perf_event *event) -{ - if (dump_trace) - perf_event__fprintf_stat(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_stat_round_stub(struct perf_session *perf_session __maybe_unused, - union perf_event *event) -{ - if (dump_trace) - perf_event__fprintf_stat_round(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int 
process_event_time_conv_stub(struct perf_session *perf_session __maybe_unused, - union perf_event *event) -{ - if (dump_trace) - perf_event__fprintf_time_conv(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int perf_session__process_compressed_event_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused, - u64 file_offset __maybe_unused, - const char *file_path __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -void perf_tool__fill_defaults(struct perf_tool *tool) -{ - if (tool->sample == NULL) - tool->sample = process_event_sample_stub; - if (tool->mmap == NULL) - tool->mmap = process_event_stub; - if (tool->mmap2 == NULL) - tool->mmap2 = process_event_stub; - if (tool->comm == NULL) - tool->comm = process_event_stub; - if (tool->namespaces == NULL) - tool->namespaces = process_event_stub; - if (tool->cgroup == NULL) - tool->cgroup = process_event_stub; - if (tool->fork == NULL) - tool->fork = process_event_stub; - if (tool->exit == NULL) - tool->exit = process_event_stub; - if (tool->lost == NULL) - tool->lost = perf_event__process_lost; - if (tool->lost_samples == NULL) - tool->lost_samples = perf_event__process_lost_samples; - if (tool->aux == NULL) - tool->aux = perf_event__process_aux; - if (tool->itrace_start == NULL) - tool->itrace_start = perf_event__process_itrace_start; - if (tool->context_switch == NULL) - tool->context_switch = perf_event__process_switch; - if (tool->ksymbol == NULL) - tool->ksymbol = perf_event__process_ksymbol; - if (tool->bpf == NULL) - tool->bpf = perf_event__process_bpf; - if (tool->text_poke == NULL) - tool->text_poke = perf_event__process_text_poke; - if (tool->aux_output_hw_id == NULL) - tool->aux_output_hw_id = perf_event__process_aux_output_hw_id; - if (tool->read == NULL) - tool->read = process_event_sample_stub; - if (tool->throttle == NULL) - tool->throttle = process_event_stub; - if (tool->unthrottle == NULL) - tool->unthrottle = process_event_stub; - if (tool->attr == NULL) - tool->attr = process_event_synth_attr_stub; - if (tool->event_update == NULL) - tool->event_update = process_event_synth_event_update_stub; - if (tool->tracing_data == NULL) - tool->tracing_data = process_event_synth_tracing_data_stub; - if (tool->build_id == NULL) - tool->build_id = process_event_op2_stub; - if (tool->finished_round == NULL) { - if (tool->ordered_events) - tool->finished_round = perf_event__process_finished_round; - else - tool->finished_round = process_finished_round_stub; - } - if (tool->id_index == NULL) - tool->id_index = process_event_op2_stub; - if (tool->auxtrace_info == NULL) - tool->auxtrace_info = process_event_op2_stub; - if (tool->auxtrace == NULL) - tool->auxtrace = process_event_auxtrace_stub; - if (tool->auxtrace_error == NULL) - tool->auxtrace_error = process_event_op2_stub; - if (tool->thread_map == NULL) - tool->thread_map = process_event_thread_map_stub; - if (tool->cpu_map == NULL) - tool->cpu_map = process_event_cpu_map_stub; - if (tool->stat_config == NULL) - tool->stat_config = process_event_stat_config_stub; - if (tool->stat == NULL) - tool->stat = process_stat_stub; - if (tool->stat_round == NULL) - tool->stat_round = process_stat_round_stub; - if (tool->time_conv == NULL) - tool->time_conv = process_event_time_conv_stub; - if (tool->feature == NULL) - tool->feature = process_event_op2_stub; - if (tool->compressed == NULL) - tool->compressed = perf_session__process_compressed_event; - if (tool->finished_init == NULL) - tool->finished_init = 
process_event_op2_stub; -} - static void swap_sample_id_all(union perf_event *event, void *data) { void *end = (void *) event + event->header.size; @@ -1677,8 +1370,7 @@ static s64 perf_session__process_user_event(struct perf_session *session, int fd = perf_data__fd(session->data); int err; - if (event->header.type != PERF_RECORD_COMPRESSED || - tool->compressed == perf_session__process_compressed_event_stub) + if (event->header.type != PERF_RECORD_COMPRESSED || perf_tool__compressed_is_stub(tool)) dump_event(session->evlist, event, file_offset, &sample, file_path); /* These events are processed right away */ diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 6886cc85600f..7f69baeae7fb 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -92,8 +92,6 @@ int perf_session__process_events(struct perf_session *session); int perf_session__queue_event(struct perf_session *s, union perf_event *event, u64 timestamp, u64 file_offset, const char *file_path); -void perf_tool__fill_defaults(struct perf_tool *tool); - int perf_session__resolve_callchain(struct perf_session *session, struct evsel *evsel, struct thread *thread, diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c new file mode 100644 index 000000000000..17219ecb8fa6 --- /dev/null +++ b/tools/perf/util/tool.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "data.h" +#include "debug.h" +#include "header.h" +#include "session.h" +#include "stat.h" +#include "tool.h" +#include "tsc.h" +#include +#include + +#ifdef HAVE_ZSTD_SUPPORT +static int perf_session__process_compressed_event(struct perf_session *session, + union perf_event *event, u64 file_offset, + const char *file_path) +{ + void *src; + size_t decomp_size, src_size; + u64 decomp_last_rem = 0; + size_t mmap_len, decomp_len = session->header.env.comp_mmap_len; + struct decomp *decomp, *decomp_last = session->active_decomp->decomp_last; + + if (decomp_last) { + decomp_last_rem = decomp_last->size - decomp_last->head; + decomp_len += decomp_last_rem; + } + + mmap_len = sizeof(struct decomp) + decomp_len; + decomp = mmap(NULL, mmap_len, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (decomp == MAP_FAILED) { + pr_err("Couldn't allocate memory for decompression\n"); + return -1; + } + + decomp->file_pos = file_offset; + decomp->file_path = file_path; + decomp->mmap_len = mmap_len; + decomp->head = 0; + + if (decomp_last_rem) { + memcpy(decomp->data, &(decomp_last->data[decomp_last->head]), decomp_last_rem); + decomp->size = decomp_last_rem; + } + + src = (void *)event + sizeof(struct perf_record_compressed); + src_size = event->pack.header.size - sizeof(struct perf_record_compressed); + + decomp_size = zstd_decompress_stream(session->active_decomp->zstd_decomp, src, src_size, + &(decomp->data[decomp_last_rem]), decomp_len - decomp_last_rem); + if (!decomp_size) { + munmap(decomp, mmap_len); + pr_err("Couldn't decompress data\n"); + return -1; + } + + decomp->size += decomp_size; + + if (session->active_decomp->decomp == NULL) + session->active_decomp->decomp = decomp; + else + session->active_decomp->decomp_last->next = decomp; + + session->active_decomp->decomp_last = decomp; + + pr_debug("decomp (B): %zd to %zd\n", src_size, decomp_size); + + return 0; +} +#endif + +static int process_event_synth_tracing_data_stub(struct perf_session *session + __maybe_unused, + union perf_event *event + __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_synth_attr_stub(const 
struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct evlist **pevlist + __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_synth_event_update_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct evlist **pevlist + __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_event_update(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct evsel *evsel __maybe_unused, + struct machine *machine __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_finished_round_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct ordered_events *oe __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int skipn(int fd, off_t n) +{ + char buf[4096]; + ssize_t ret; + + while (n > 0) { + ret = read(fd, buf, min(n, (off_t)sizeof(buf))); + if (ret <= 0) + return ret; + n -= ret; + } + + return 0; +} + +static s64 process_event_auxtrace_stub(struct perf_session *session __maybe_unused, + union perf_event *event) +{ + dump_printf(": unhandled!\n"); + if (perf_data__is_pipe(session->data)) + skipn(perf_data__fd(session->data), event->auxtrace.size); + return event->auxtrace.size; +} + +static int process_event_op2_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + + +static +int process_event_thread_map_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_thread_map(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static +int process_event_cpu_map_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_cpu_map(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static +int process_event_stat_config_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_stat_config(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_stat_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_stat(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_stat_round_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_stat_round(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_time_conv_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_time_conv(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int perf_session__process_compressed_event_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused, + u64 file_offset 
__maybe_unused, + const char *file_path __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +void perf_tool__fill_defaults(struct perf_tool *tool) +{ + if (tool->sample == NULL) + tool->sample = process_event_sample_stub; + if (tool->mmap == NULL) + tool->mmap = process_event_stub; + if (tool->mmap2 == NULL) + tool->mmap2 = process_event_stub; + if (tool->comm == NULL) + tool->comm = process_event_stub; + if (tool->namespaces == NULL) + tool->namespaces = process_event_stub; + if (tool->cgroup == NULL) + tool->cgroup = process_event_stub; + if (tool->fork == NULL) + tool->fork = process_event_stub; + if (tool->exit == NULL) + tool->exit = process_event_stub; + if (tool->lost == NULL) + tool->lost = perf_event__process_lost; + if (tool->lost_samples == NULL) + tool->lost_samples = perf_event__process_lost_samples; + if (tool->aux == NULL) + tool->aux = perf_event__process_aux; + if (tool->itrace_start == NULL) + tool->itrace_start = perf_event__process_itrace_start; + if (tool->context_switch == NULL) + tool->context_switch = perf_event__process_switch; + if (tool->ksymbol == NULL) + tool->ksymbol = perf_event__process_ksymbol; + if (tool->bpf == NULL) + tool->bpf = perf_event__process_bpf; + if (tool->text_poke == NULL) + tool->text_poke = perf_event__process_text_poke; + if (tool->aux_output_hw_id == NULL) + tool->aux_output_hw_id = perf_event__process_aux_output_hw_id; + if (tool->read == NULL) + tool->read = process_event_sample_stub; + if (tool->throttle == NULL) + tool->throttle = process_event_stub; + if (tool->unthrottle == NULL) + tool->unthrottle = process_event_stub; + if (tool->attr == NULL) + tool->attr = process_event_synth_attr_stub; + if (tool->event_update == NULL) + tool->event_update = process_event_synth_event_update_stub; + if (tool->tracing_data == NULL) + tool->tracing_data = process_event_synth_tracing_data_stub; + if (tool->build_id == NULL) + tool->build_id = process_event_op2_stub; + if (tool->finished_round == NULL) { + if (tool->ordered_events) + tool->finished_round = perf_event__process_finished_round; + else + tool->finished_round = process_finished_round_stub; + } + if (tool->id_index == NULL) + tool->id_index = process_event_op2_stub; + if (tool->auxtrace_info == NULL) + tool->auxtrace_info = process_event_op2_stub; + if (tool->auxtrace == NULL) + tool->auxtrace = process_event_auxtrace_stub; + if (tool->auxtrace_error == NULL) + tool->auxtrace_error = process_event_op2_stub; + if (tool->thread_map == NULL) + tool->thread_map = process_event_thread_map_stub; + if (tool->cpu_map == NULL) + tool->cpu_map = process_event_cpu_map_stub; + if (tool->stat_config == NULL) + tool->stat_config = process_event_stat_config_stub; + if (tool->stat == NULL) + tool->stat = process_stat_stub; + if (tool->stat_round == NULL) + tool->stat_round = process_stat_round_stub; + if (tool->time_conv == NULL) + tool->time_conv = process_event_time_conv_stub; + if (tool->feature == NULL) + tool->feature = process_event_op2_stub; + if (tool->compressed == NULL) { +#ifdef HAVE_ZSTD_SUPPORT + tool->compressed = perf_session__process_compressed_event; +#else + tool->compressed = perf_session__process_compressed_event_stub; +#endif + } + if (tool->finished_init == NULL) + tool->finished_init = process_event_op2_stub; +} + +bool perf_tool__compressed_is_stub(const struct perf_tool *tool) +{ + return tool->compressed == perf_session__process_compressed_event_stub; +} diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 2b12c674768f..b917b1fa6204 100644 --- 
a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -89,4 +89,8 @@ struct perf_tool { enum show_feature_header show_feat_hdr; }; +void perf_tool__fill_defaults(struct perf_tool *tool); + +bool perf_tool__compressed_is_stub(const struct perf_tool *tool); + #endif /* __PERF_TOOL_H */ -- cgit v1.2.3 From ae737b61029ced77391e2b850433cd316716ddf9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:57 -0700 Subject: perf tool: Add perf_tool__init() Add init function that behaves like perf_tool__fill_defaults() but assumes all values haven't been initialized. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/tool.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/tool.h | 1 + 2 files changed, 59 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c index 17219ecb8fa6..cbd9b888bd73 100644 --- a/tools/perf/util/tool.c +++ b/tools/perf/util/tool.c @@ -230,6 +230,64 @@ static int perf_session__process_compressed_event_stub(struct perf_session *sess return 0; } +void perf_tool__init(struct perf_tool *tool, bool ordered_events) +{ + tool->ordered_events = ordered_events; + tool->ordering_requires_timestamps = false; + tool->namespace_events = false; + tool->cgroup_events = false; + tool->no_warn = false; + tool->show_feat_hdr = SHOW_FEAT_NO_HEADER; + + tool->sample = process_event_sample_stub; + tool->mmap = process_event_stub; + tool->mmap2 = process_event_stub; + tool->comm = process_event_stub; + tool->namespaces = process_event_stub; + tool->cgroup = process_event_stub; + tool->fork = process_event_stub; + tool->exit = process_event_stub; + tool->lost = perf_event__process_lost; + tool->lost_samples = perf_event__process_lost_samples; + tool->aux = perf_event__process_aux; + tool->itrace_start = perf_event__process_itrace_start; + tool->context_switch = perf_event__process_switch; + tool->ksymbol = perf_event__process_ksymbol; + tool->bpf = perf_event__process_bpf; + tool->text_poke = perf_event__process_text_poke; + tool->aux_output_hw_id = perf_event__process_aux_output_hw_id; + tool->read = process_event_sample_stub; + tool->throttle = process_event_stub; + tool->unthrottle = process_event_stub; + tool->attr = process_event_synth_attr_stub; + tool->event_update = process_event_synth_event_update_stub; + tool->tracing_data = process_event_synth_tracing_data_stub; + tool->build_id = process_event_op2_stub; + + if (ordered_events) + tool->finished_round = perf_event__process_finished_round; + else + tool->finished_round = process_finished_round_stub; + + tool->id_index = process_event_op2_stub; + tool->auxtrace_info = process_event_op2_stub; + tool->auxtrace = process_event_auxtrace_stub; + tool->auxtrace_error = process_event_op2_stub; + tool->thread_map = process_event_thread_map_stub; + tool->cpu_map = process_event_cpu_map_stub; + tool->stat_config = process_event_stat_config_stub; + tool->stat = process_stat_stub; + 
tool->stat_round = process_stat_round_stub; + tool->time_conv = process_event_time_conv_stub; + tool->feature = process_event_op2_stub; +#ifdef HAVE_ZSTD_SUPPORT + tool->compressed = perf_session__process_compressed_event; +#else + tool->compressed = perf_session__process_compressed_event_stub; +#endif + tool->finished_init = process_event_op2_stub; +} + void perf_tool__fill_defaults(struct perf_tool *tool) { if (tool->sample == NULL) diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index b917b1fa6204..7c7ce395e573 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -89,6 +89,7 @@ struct perf_tool { enum show_feature_header show_feat_hdr; }; +void perf_tool__init(struct perf_tool *tool, bool ordered_events); void perf_tool__fill_defaults(struct perf_tool *tool); bool perf_tool__compressed_is_stub(const struct perf_tool *tool); -- cgit v1.2.3 From f32b37cc783ac28a58ab75542de7cec2c7c01e74 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:58 -0700 Subject: perf kmem: Use perf_tool__init Reduce the scope of the tool from global/static to just that of the cmd_kmem function where the session is scoped. Use the perf_tool__init() to initialize default values. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kmem.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 859ff018eace..b3cbac40b8c7 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -986,15 +986,6 @@ static int process_sample_event(const struct perf_tool *tool __maybe_unused, return err; } -static struct perf_tool perf_kmem = { - .sample = process_sample_event, - .comm = perf_event__process_comm, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .namespaces = perf_event__process_namespaces, - .ordered_events = true, -}; - static double fragmentation(unsigned long n_req, unsigned long n_alloc) { if (n_alloc == 0) @@ -1971,6 +1962,7 @@ int cmd_kmem(int argc, const char **argv) NULL }; struct perf_session *session; + struct perf_tool perf_kmem; static const char errmsg[] = "No %s allocation events found. 
Have you run 'perf kmem record --%s'?\n"; int ret = perf_config(kmem_config, NULL); @@ -1998,6 +1990,13 @@ int cmd_kmem(int argc, const char **argv) data.path = input_name; + perf_tool__init(&perf_kmem, /*ordered_events=*/true); + perf_kmem.sample = process_sample_event; + perf_kmem.comm = perf_event__process_comm; + perf_kmem.mmap = perf_event__process_mmap; + perf_kmem.mmap2 = perf_event__process_mmap2; + perf_kmem.namespaces = perf_event__process_namespaces; + kmem_session = session = perf_session__new(&data, &perf_kmem); if (IS_ERR(session)) return PTR_ERR(session); -- cgit v1.2.3 From 584a268f50758a35b7176ecad798bbe77d672d21 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:59 -0700 Subject: perf buildid-list: Use perf_tool__init Reduce scope of build_id__mark_dso_hit_ops() to the scope of function perf_session__list_build_ids, its only use, and use perf_tool__init() for the default values. Move perf_event__exit_del_thread() to event.[ch] so it can be used in builtin-buildid-list.c. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-buildid-list.c | 10 ++++++++++ tools/perf/util/build-id.c | 32 -------------------------------- tools/perf/util/build-id.h | 4 +--- tools/perf/util/event.c | 20 ++++++++++++++++++++ tools/perf/util/event.h | 4 ++++ 5 files changed, 35 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 383d5de36ce4..52dfacaff8e3 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -89,6 +89,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits) .mode = PERF_DATA_MODE_READ, .force = force, }; + struct perf_tool build_id__mark_dso_hit_ops; symbol__elf_init(); /* @@ -97,6 +98,15 @@ static int perf_session__list_build_ids(bool force, bool with_hits) if (filename__fprintf_build_id(input_name, stdout) > 0) goto out; + perf_tool__init(&build_id__mark_dso_hit_ops, /*ordered_events=*/true); + build_id__mark_dso_hit_ops.sample = build_id__mark_dso_hit; + build_id__mark_dso_hit_ops.mmap = perf_event__process_mmap; + build_id__mark_dso_hit_ops.mmap2 = perf_event__process_mmap2; + build_id__mark_dso_hit_ops.fork = perf_event__process_fork; + build_id__mark_dso_hit_ops.exit = perf_event__exit_del_thread; + build_id__mark_dso_hit_ops.attr = perf_event__process_attr; + build_id__mark_dso_hit_ops.build_id = perf_event__process_build_id; + session = perf_session__new(&data, &build_id__mark_dso_hit_ops); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 098fcc625d91..451d145fa4ed 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -67,38 +67,6 @@ int build_id__mark_dso_hit(const struct perf_tool *tool __maybe_unused, return 0; } -static int perf_event__exit_del_thread(const struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct 
perf_sample *sample - __maybe_unused, - struct machine *machine) -{ - struct thread *thread = machine__findnew_thread(machine, - event->fork.pid, - event->fork.tid); - - dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, - event->fork.ppid, event->fork.ptid); - - if (thread) { - machine__remove_thread(machine, thread); - thread__put(thread); - } - - return 0; -} - -struct perf_tool build_id__mark_dso_hit_ops = { - .sample = build_id__mark_dso_hit, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .fork = perf_event__process_fork, - .exit = perf_event__exit_del_thread, - .attr = perf_event__process_attr, - .build_id = perf_event__process_build_id, - .ordered_events = true, -}; - int build_id__sprintf(const struct build_id *build_id, char *bf) { char *bid = bf; diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index ae87c4c58d5b..a212497bfdb0 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -16,11 +16,9 @@ struct build_id { size_t size; }; -struct nsinfo; - -extern struct perf_tool build_id__mark_dso_hit_ops; struct dso; struct feat_fd; +struct nsinfo; void build_id__init(struct build_id *bid, const u8 *data, size_t size); int build_id__sprintf(const struct build_id *build_id, char *bf); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index c2f0e7f40ad5..aac96d5d1917 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -426,6 +426,26 @@ int perf_event__process_exit(const struct perf_tool *tool __maybe_unused, return machine__process_exit_event(machine, event, sample); } +int perf_event__exit_del_thread(const struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine) +{ + struct thread *thread = machine__findnew_thread(machine, + event->fork.pid, + event->fork.tid); + + dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, + event->fork.ppid, event->fork.ptid); + + if (thread) { + machine__remove_thread(machine, thread); + thread__put(thread); + } + + return 0; +} + size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp) { return fprintf(fp, " offset: %#"PRI_lx64" size: %#"PRI_lx64" flags: %#"PRI_lx64" [%s%s%s]\n", diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 4b24f1c580fd..f8742e6230a5 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -319,6 +319,10 @@ int perf_event__process_exit(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); +int perf_event__exit_del_thread(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); int perf_event__process_ksymbol(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, -- cgit v1.2.3 From a01a5ef98870bd7a72f7eee0cb6168233c41bf53 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:00 -0700 Subject: perf kvm: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
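The conversion pattern repeated throughout this series: drop the file-scope designated initializer for 'struct perf_tool' (whose unset members used to be patched up later by perf_tool__fill_defaults()) and instead set every member to a valid default with perf_tool__init(), overriding only the callbacks the command actually consumes. A minimal sketch of the before/after shape, with a hypothetical process_sample_event handler standing in for a real one:

	/* Before: static initializer; members left out here stayed NULL
	 * until perf_tool__fill_defaults() ran. */
	static struct perf_tool eops = {
		.sample         = process_sample_event,
		.comm           = perf_event__process_comm,
		.ordered_events = true,
	};

	/* After: function-scoped, fully initialized up front, so no
	 * fill-defaults pass is needed and 'const struct perf_tool *'
	 * can be handed to the session code safely. */
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/true);
	tool.sample = process_sample_event;	/* overrides only */
	tool.comm   = perf_event__process_comm;

Besides enabling const, moving the call next to perf_session__new() shrinks the tool from global/static scope down to the function that owns the session, as the kmem conversion above and the kvm conversion below both do.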
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index a3b903cf4311..692267b1b7e8 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1603,19 +1603,17 @@ static int read_events(struct perf_kvm_stat *kvm) { int ret; - struct perf_tool eops = { - .sample = process_sample_event, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .ordered_events = true, - }; struct perf_data file = { .path = kvm->file_name, .mode = PERF_DATA_MODE_READ, .force = kvm->force, }; - kvm->tool = eops; + perf_tool__init(&kvm->tool, /*ordered_events=*/true); + kvm->tool.sample = process_sample_event; + kvm->tool.comm = perf_event__process_comm; + kvm->tool.namespaces = perf_event__process_namespaces; + kvm->session = perf_session__new(&file, &kvm->tool); if (IS_ERR(kvm->session)) { pr_err("Initializing perf session failed\n"); @@ -1919,14 +1917,13 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, /* event handling */ + perf_tool__init(&kvm->tool, /*ordered_events=*/true); kvm->tool.sample = process_sample_event; kvm->tool.comm = perf_event__process_comm; kvm->tool.exit = perf_event__process_exit; kvm->tool.fork = perf_event__process_fork; kvm->tool.lost = process_lost_event; kvm->tool.namespaces = perf_event__process_namespaces; - kvm->tool.ordered_events = true; - perf_tool__fill_defaults(&kvm->tool); /* set defaults */ kvm->display_time = 1; -- cgit v1.2.3 From b4fd4d00f9d4f51eab54e5b336afe27a846f1578 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:01 -0700 Subject: perf lock: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 6efa9d646637..2c216427e929 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1933,22 +1933,21 @@ static bool force; static int __cmd_report(bool display_info) { int err = -EINVAL; - struct perf_tool eops = { - .attr = perf_event__process_attr, - .event_update = process_event_update, - .sample = process_sample_event, - .comm = perf_event__process_comm, - .mmap = perf_event__process_mmap, - .namespaces = perf_event__process_namespaces, - .tracing_data = perf_event__process_tracing_data, - .ordered_events = true, - }; + struct perf_tool eops; struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = force, }; + perf_tool__init(&eops, /*ordered_events=*/true); + eops.attr = perf_event__process_attr; + eops.event_update = process_event_update; + eops.sample = process_sample_event; + eops.comm = perf_event__process_comm; + eops.mmap = perf_event__process_mmap; + eops.namespaces = perf_event__process_namespaces; + eops.tracing_data = perf_event__process_tracing_data; session = perf_session__new(&data, &eops); if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); @@ -2069,15 +2068,7 @@ static int check_lock_contention_options(const struct option *options, static int __cmd_contention(int argc, const char **argv) { int err = -EINVAL; - struct perf_tool eops = { - .attr = perf_event__process_attr, - .event_update = process_event_update, - .sample = process_sample_event, - .comm = perf_event__process_comm, - .mmap = perf_event__process_mmap, - .tracing_data = perf_event__process_tracing_data, - .ordered_events = true, - }; + struct perf_tool eops; struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, @@ -2100,6 +2091,14 @@ static int __cmd_contention(int argc, const char **argv) con.result = &lockhash_table[0]; + perf_tool__init(&eops, /*ordered_events=*/true); + eops.attr = perf_event__process_attr; + eops.event_update = process_event_update; + eops.sample = process_sample_event; + eops.comm = perf_event__process_comm; + eops.mmap = perf_event__process_mmap; + eops.tracing_data = perf_event__process_tracing_data; + session = perf_session__new(use_bpf ? NULL : &data, &eops); if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); -- cgit v1.2.3 From 419cbc44f5b94c1bc6a68e4c694ee46fe3907fc0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:02 -0700 Subject: perf evlist: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
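evlist is nearly the degenerate case of the pattern: when reading a regular perf.data file the attributes and features come from the file header, so the only callbacks worth installing are the two needed for pipe mode, where that information arrives in the event stream instead. Condensed from the conversion below:

	perf_tool__init(&tool, /*ordered_events=*/false);
	/* Only needed for pipe mode: attributes and header features
	 * are delivered as events rather than read from a seekable
	 * file header. */
	tool.attr    = perf_event__process_attr;
	tool.feature = process_header_feature;

Every other event type falls through to the stubs installed by perf_tool__init(), which print ": unhandled!" under dump_trace and otherwise return 0.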
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-evlist.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 7117656939e7..818ab21c3f73 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -35,13 +35,13 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details .mode = PERF_DATA_MODE_READ, .force = details->force, }; - struct perf_tool tool = { - /* only needed for pipe mode */ - .attr = perf_event__process_attr, - .feature = process_header_feature, - }; + struct perf_tool tool; bool has_tracepoint = false; + perf_tool__init(&tool, /*ordered_events=*/false); + /* only needed for pipe mode */ + tool.attr = perf_event__process_attr; + tool.feature = process_header_feature; session = perf_session__new(&data, &tool); if (IS_ERR(session)) return PTR_ERR(session); -- cgit v1.2.3 From cecb1cf154b301c6f8da434adccd7bcb855d3191 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:03 -0700 Subject: perf record: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 33 ++++++++++++++++++++------------- tools/perf/util/tool.c | 10 +++++----- tools/perf/util/tool.h | 6 ++++++ 3 files changed, 31 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 46410eb3a76b..39367709fd99 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -194,6 +194,15 @@ static const char *affinity_tags[PERF_AFFINITY_MAX] = { "SYS", "NODE", "CPU" }; +static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine); +static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine); +static int process_timestamp_boundary(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); + #ifndef HAVE_GETTID static inline pid_t gettid(void) { @@ -1459,7 +1468,7 @@ static int process_buildids(struct record *rec) * first/last samples. 
*/ if (rec->buildid_all && !rec->timestamp_boundary) - rec->tool.sample = NULL; + rec->tool.sample = process_event_sample_stub; return perf_session__process_events(session); } @@ -2387,6 +2396,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGUSR2, SIG_IGN); } + perf_tool__init(tool, /*ordered_events=*/true); + tool->sample = process_sample_event; + tool->fork = perf_event__process_fork; + tool->exit = perf_event__process_exit; + tool->comm = perf_event__process_comm; + tool->namespaces = perf_event__process_namespaces; + tool->mmap = build_id__process_mmap; + tool->mmap2 = build_id__process_mmap2; + tool->itrace_start = process_timestamp_boundary; + tool->aux = process_timestamp_boundary; session = perf_session__new(data, tool); if (IS_ERR(session)) { pr_err("Perf session creation failed.\n"); @@ -3327,18 +3346,6 @@ static struct record record = { .ctl_fd_ack = -1, .synth = PERF_SYNTH_ALL, }, - .tool = { - .sample = process_sample_event, - .fork = perf_event__process_fork, - .exit = perf_event__process_exit, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .mmap = build_id__process_mmap, - .mmap2 = build_id__process_mmap2, - .itrace_start = process_timestamp_boundary, - .aux = process_timestamp_boundary, - .ordered_events = true, - }, }; const char record_callchain_help[] = CALLCHAIN_RECORD_HELP diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c index cbd9b888bd73..a4ca16e5eefe 100644 --- a/tools/perf/util/tool.c +++ b/tools/perf/util/tool.c @@ -99,11 +99,11 @@ static int process_event_synth_event_update_stub(const struct perf_tool *tool __ return 0; } -static int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_sample *sample __maybe_unused, - struct evsel *evsel __maybe_unused, - struct machine *machine __maybe_unused) +int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct evsel *evsel __maybe_unused, + struct machine *machine __maybe_unused) { dump_printf(": unhandled!\n"); return 0; diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 7c7ce395e573..c7fea58729a5 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -94,4 +94,10 @@ void perf_tool__fill_defaults(struct perf_tool *tool); bool perf_tool__compressed_is_stub(const struct perf_tool *tool); +int process_event_sample_stub(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct evsel *evsel, + struct machine *machine); + #endif /* __PERF_TOOL_H */ -- cgit v1.2.3 From 6bfb6df8663f3baf901f6540bd3f95ccb86c2b9b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:04 -0700 Subject: perf c2c: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
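Two details of the new scheme show up around this point in the series. First, perf_tool__init() only parameterizes ordered_events; flags it defaults to false, such as ordering_requires_timestamps, are flipped explicitly after the call (c2c below, and script, report, annotate and others later on):

	perf_tool__init(&c2c.tool, /*ordered_events=*/true);
	c2c.tool.sample = process_sample_event;
	/* ... remaining callback overrides ... */
	c2c.tool.ordering_requires_timestamps = true;

Second, since every callback is now expected to be callable, "disabling" a handler means pointing it back at an exported stub rather than storing NULL, which is why record's build-id pass above assigns process_event_sample_stub instead of clearing rec->tool.sample.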
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 88c131d05186..cd2bd573bfc3 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -385,24 +385,6 @@ free_mi: goto out; } -static struct perf_c2c c2c = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, - .attr = perf_event__process_attr, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .auxtrace_error = perf_event__process_auxtrace_error, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, -}; - static const char * const c2c_usage[] = { "perf c2c {record|report}", NULL @@ -3070,6 +3052,19 @@ static int perf_c2c__report(int argc, const char **argv) data.path = input_name; data.force = symbol_conf.force; + perf_tool__init(&c2c.tool, /*ordered_events=*/true); + c2c.tool.sample = process_sample_event; + c2c.tool.mmap = perf_event__process_mmap; + c2c.tool.mmap2 = perf_event__process_mmap2; + c2c.tool.comm = perf_event__process_comm; + c2c.tool.exit = perf_event__process_exit; + c2c.tool.fork = perf_event__process_fork; + c2c.tool.lost = perf_event__process_lost; + c2c.tool.attr = perf_event__process_attr; + c2c.tool.auxtrace_info = perf_event__process_auxtrace_info; + c2c.tool.auxtrace = perf_event__process_auxtrace; + c2c.tool.auxtrace_error = perf_event__process_auxtrace_error; + c2c.tool.ordering_requires_timestamps = true; session = perf_session__new(&data, &c2c.tool); if (IS_ERR(session)) { err = PTR_ERR(session); -- cgit v1.2.3 From 2fa28ccb17c4bfc31f7752b59e4b90856d7b33b1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:05 -0700 Subject: perf script: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
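Not every conversion hard-codes the ordering decision. script disables ordering for unsorted dumps, and inject, a little further below, collapses what used to be three scattered 'ordered_events = true' assignments into one expression evaluated before the init call:

	/* script: an unsorted dump prints events in file order. */
	perf_tool__init(&script.tool, /*ordered_events=*/!unsorted_dump);

	/* inject: ordering is only needed for JIT mode, sched_stat
	 * processing, or selective build-id injection. */
	ordered_events = inject.jit_mode || inject.sched_stat ||
			 (inject.build_ids && !inject.build_id_all);
	perf_tool__init(&inject.tool, ordered_events);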
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 65 +++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 31a6f800d55f..54598d1e815a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3899,38 +3899,7 @@ int cmd_script(int argc, const char **argv) const char *dlfilter_file = NULL; const char **__argv; int i, j, err = 0; - struct perf_script script = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .cgroup = perf_event__process_cgroup, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .attr = process_attr, - .event_update = perf_event__process_event_update, -#ifdef HAVE_LIBTRACEEVENT - .tracing_data = perf_event__process_tracing_data, -#endif - .feature = process_feature_event, - .build_id = perf_event__process_build_id, - .id_index = perf_event__process_id_index, - .auxtrace_info = perf_script__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .auxtrace_error = perf_event__process_auxtrace_error, - .stat = perf_event__process_stat_event, - .stat_round = process_stat_round_event, - .stat_config = process_stat_config_event, - .thread_map = process_thread_map_event, - .cpu_map = process_cpu_map_event, - .throttle = process_throttle_event, - .unthrottle = process_throttle_event, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, - }; + struct perf_script script = {}; struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; @@ -4104,10 +4073,8 @@ int cmd_script(int argc, const char **argv) data.path = input_name; data.force = symbol_conf.force; - if (unsorted_dump) { + if (unsorted_dump) dump_trace = true; - script.tool.ordered_events = false; - } if (symbol__validate_sym_arguments()) return -1; @@ -4298,6 +4265,34 @@ script_found: use_browser = 0; } + perf_tool__init(&script.tool, !unsorted_dump); + script.tool.sample = process_sample_event; + script.tool.mmap = perf_event__process_mmap; + script.tool.mmap2 = perf_event__process_mmap2; + script.tool.comm = perf_event__process_comm; + script.tool.namespaces = perf_event__process_namespaces; + script.tool.cgroup = perf_event__process_cgroup; + script.tool.exit = perf_event__process_exit; + script.tool.fork = perf_event__process_fork; + script.tool.attr = process_attr; + script.tool.event_update = perf_event__process_event_update; +#ifdef HAVE_LIBTRACEEVENT + script.tool.tracing_data = perf_event__process_tracing_data; +#endif + script.tool.feature = process_feature_event; + script.tool.build_id = perf_event__process_build_id; + script.tool.id_index = perf_event__process_id_index; + script.tool.auxtrace_info = perf_script__process_auxtrace_info; + 
script.tool.auxtrace = perf_event__process_auxtrace; + script.tool.auxtrace_error = perf_event__process_auxtrace_error; + script.tool.stat = perf_event__process_stat_event; + script.tool.stat_round = process_stat_round_event; + script.tool.stat_config = process_stat_config_event; + script.tool.thread_map = process_thread_map_event; + script.tool.cpu_map = process_cpu_map_event; + script.tool.throttle = process_throttle_event; + script.tool.unthrottle = process_throttle_event; + script.tool.ordering_requires_timestamps = true; session = perf_session__new(&data, &script.tool); if (IS_ERR(session)) return PTR_ERR(session); -- cgit v1.2.3 From a37c0436f3799d02f13781d5e1c836b956b8c05d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:06 -0700 Subject: perf inject: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 91 +++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 283429ccd034..ef9cba173dd2 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2165,47 +2165,6 @@ static int __cmd_inject(struct perf_inject *inject) int cmd_inject(int argc, const char **argv) { struct perf_inject inject = { - .tool = { - .sample = perf_event__repipe_sample, - .read = perf_event__repipe_sample, - .mmap = perf_event__repipe, - .mmap2 = perf_event__repipe, - .comm = perf_event__repipe, - .namespaces = perf_event__repipe, - .cgroup = perf_event__repipe, - .fork = perf_event__repipe, - .exit = perf_event__repipe, - .lost = perf_event__repipe, - .lost_samples = perf_event__repipe, - .aux = perf_event__repipe, - .itrace_start = perf_event__repipe, - .aux_output_hw_id = perf_event__repipe, - .context_switch = perf_event__repipe, - .throttle = perf_event__repipe, - .unthrottle = perf_event__repipe, - .ksymbol = perf_event__repipe, - .bpf = perf_event__repipe, - .text_poke = perf_event__repipe, - .attr = perf_event__repipe_attr, - .event_update = perf_event__repipe_event_update, - .tracing_data = perf_event__repipe_op2_synth, - .finished_round = perf_event__repipe_oe_synth, - .build_id = perf_event__repipe_op2_synth, - .id_index = perf_event__repipe_op2_synth, - .auxtrace_info = perf_event__repipe_op2_synth, - .auxtrace_error = perf_event__repipe_op2_synth, - .time_conv = perf_event__repipe_op2_synth, - .thread_map = perf_event__repipe_op2_synth, - .cpu_map = perf_event__repipe_op2_synth, - .stat_config = perf_event__repipe_op2_synth, - .stat = perf_event__repipe_op2_synth, - .stat_round = perf_event__repipe_op2_synth, - .feature = perf_event__repipe_op2_synth, - .finished_init = perf_event__repipe_op2_synth, - .compressed = perf_event__repipe_op4_synth, - .auxtrace = perf_event__repipe_auxtrace, 
- .dont_split_sample_group = true, - }, .input_name = "-", .samples = LIST_HEAD_INIT(inject.samples), .output = { @@ -2270,6 +2229,7 @@ int cmd_inject(int argc, const char **argv) "perf inject []", NULL }; + bool ordered_events; if (!inject.itrace_synth_opts.set) { /* Disable eager loading of kernel symbols that adds overhead to perf inject. */ @@ -2334,7 +2294,48 @@ int cmd_inject(int argc, const char **argv) if (strcmp(inject.input_name, "-")) repipe = false; } - + ordered_events = inject.jit_mode || inject.sched_stat || + (inject.build_ids && !inject.build_id_all); + perf_tool__init(&inject.tool, ordered_events); + inject.tool.sample = perf_event__repipe_sample; + inject.tool.read = perf_event__repipe_sample; + inject.tool.mmap = perf_event__repipe; + inject.tool.mmap2 = perf_event__repipe; + inject.tool.comm = perf_event__repipe; + inject.tool.namespaces = perf_event__repipe; + inject.tool.cgroup = perf_event__repipe; + inject.tool.fork = perf_event__repipe; + inject.tool.exit = perf_event__repipe; + inject.tool.lost = perf_event__repipe; + inject.tool.lost_samples = perf_event__repipe; + inject.tool.aux = perf_event__repipe; + inject.tool.itrace_start = perf_event__repipe; + inject.tool.aux_output_hw_id = perf_event__repipe; + inject.tool.context_switch = perf_event__repipe; + inject.tool.throttle = perf_event__repipe; + inject.tool.unthrottle = perf_event__repipe; + inject.tool.ksymbol = perf_event__repipe; + inject.tool.bpf = perf_event__repipe; + inject.tool.text_poke = perf_event__repipe; + inject.tool.attr = perf_event__repipe_attr; + inject.tool.event_update = perf_event__repipe_event_update; + inject.tool.tracing_data = perf_event__repipe_op2_synth; + inject.tool.finished_round = perf_event__repipe_oe_synth; + inject.tool.build_id = perf_event__repipe_op2_synth; + inject.tool.id_index = perf_event__repipe_op2_synth; + inject.tool.auxtrace_info = perf_event__repipe_op2_synth; + inject.tool.auxtrace_error = perf_event__repipe_op2_synth; + inject.tool.time_conv = perf_event__repipe_op2_synth; + inject.tool.thread_map = perf_event__repipe_op2_synth; + inject.tool.cpu_map = perf_event__repipe_op2_synth; + inject.tool.stat_config = perf_event__repipe_op2_synth; + inject.tool.stat = perf_event__repipe_op2_synth; + inject.tool.stat_round = perf_event__repipe_op2_synth; + inject.tool.feature = perf_event__repipe_op2_synth; + inject.tool.finished_init = perf_event__repipe_op2_synth; + inject.tool.compressed = perf_event__repipe_op4_synth; + inject.tool.auxtrace = perf_event__repipe_auxtrace; + inject.tool.dont_split_sample_group = true; inject.session = __perf_session__new(&data, repipe, output_fd(&inject), &inject.tool); @@ -2373,7 +2374,6 @@ int cmd_inject(int argc, const char **argv) * mmaps. We cannot generate the buildid hit list and * inject the jit mmaps at the same time for now. 
*/ - inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; if (known_build_ids != NULL) { inject.known_build_ids = @@ -2386,15 +2386,10 @@ int cmd_inject(int argc, const char **argv) } } - if (inject.sched_stat) { - inject.tool.ordered_events = true; - } - #ifdef HAVE_JITDUMP if (inject.jit_mode) { inject.tool.mmap2 = perf_event__jit_repipe_mmap2; inject.tool.mmap = perf_event__jit_repipe_mmap; - inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; /* * JIT MMAP injection injects all MMAP events in one go, so it -- cgit v1.2.3 From 113f614c6dd0c2fb13f4c6d7f3c4a82212de0c0f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:07 -0700 Subject: perf report: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 55 ++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 5f609a7791ea..dfb47fa85e5c 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -799,7 +799,7 @@ static int process_attr(const struct perf_tool *tool __maybe_unused, static void stats_setup(struct report *rep) { - memset(&rep->tool, 0, sizeof(rep->tool)); + perf_tool__init(&rep->tool, /*ordered_events=*/false); rep->tool.attr = process_attr; rep->tool.sample = count_sample_event; rep->tool.lost_samples = count_lost_samples_event; @@ -817,8 +817,7 @@ static int stats_print(struct report *rep) static void tasks_setup(struct report *rep) { - memset(&rep->tool, 0, sizeof(rep->tool)); - rep->tool.ordered_events = true; + perf_tool__init(&rep->tool, /*ordered_events=*/true); if (rep->mmaps_mode) { rep->tool.mmap = perf_event__process_mmap; rep->tool.mmap2 = perf_event__process_mmap2; @@ -1272,30 +1271,6 @@ int cmd_report(int argc, const char **argv) NULL }; struct report report = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .cgroup = perf_event__process_cgroup, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, - .read = process_read_event, - .attr = process_attr, -#ifdef HAVE_LIBTRACEEVENT - .tracing_data = perf_event__process_tracing_data, -#endif - .build_id = perf_event__process_build_id, - .id_index = perf_event__process_id_index, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .event_update = perf_event__process_event_update, - .feature = process_feature_event, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, .max_stack = PERF_MAX_STACK_DEPTH, .pretty_printing_style = 
"normal", .socket_filter = -1, @@ -1477,6 +1452,7 @@ int cmd_report(int argc, const char **argv) }; int ret = hists__init(); char sort_tmp[128]; + bool ordered_events = true; if (ret < 0) goto exit; @@ -1531,7 +1507,7 @@ int cmd_report(int argc, const char **argv) report.tasks_mode = true; if (dump_trace && report.disable_order) - report.tool.ordered_events = false; + ordered_events = false; if (quiet) perf_quiet_option(); @@ -1562,6 +1538,29 @@ int cmd_report(int argc, const char **argv) symbol_conf.skip_empty = report.skip_empty; repeat: + perf_tool__init(&report.tool, ordered_events); + report.tool.sample = process_sample_event; + report.tool.mmap = perf_event__process_mmap; + report.tool.mmap2 = perf_event__process_mmap2; + report.tool.comm = perf_event__process_comm; + report.tool.namespaces = perf_event__process_namespaces; + report.tool.cgroup = perf_event__process_cgroup; + report.tool.exit = perf_event__process_exit; + report.tool.fork = perf_event__process_fork; + report.tool.lost = perf_event__process_lost; + report.tool.read = process_read_event; + report.tool.attr = process_attr; +#ifdef HAVE_LIBTRACEEVENT + report.tool.tracing_data = perf_event__process_tracing_data; +#endif + report.tool.build_id = perf_event__process_build_id; + report.tool.id_index = perf_event__process_id_index; + report.tool.auxtrace_info = perf_event__process_auxtrace_info; + report.tool.auxtrace = perf_event__process_auxtrace; + report.tool.event_update = perf_event__process_event_update; + report.tool.feature = process_feature_event; + report.tool.ordering_requires_timestamps = true; + session = perf_session__new(&data, &report.tool); if (IS_ERR(session)) { ret = PTR_ERR(session); -- cgit v1.2.3 From 071b117e755bd5000ea13f488e8d19ed9e42121c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:08 -0700 Subject: perf stat: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d44e708f8a44..487eff9a8582 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2271,15 +2271,6 @@ static const char * const stat_report_usage[] = { }; static struct perf_stat perf_stat = { - .tool = { - .attr = perf_event__process_attr, - .event_update = perf_event__process_event_update, - .thread_map = process_thread_map_event, - .cpu_map = process_cpu_map_event, - .stat_config = process_stat_config_event, - .stat = perf_event__process_stat_event, - .stat_round = process_stat_round_event, - }, .aggr_mode = AGGR_UNSET, .aggr_level = 0, }; @@ -2322,6 +2313,15 @@ static int __cmd_report(int argc, const char **argv) perf_stat.data.path = input_name; perf_stat.data.mode = PERF_DATA_MODE_READ; + perf_tool__init(&perf_stat.tool, /*ordered_events=*/false); + perf_stat.tool.attr = perf_event__process_attr; + perf_stat.tool.event_update = perf_event__process_event_update; + perf_stat.tool.thread_map = process_thread_map_event; + perf_stat.tool.cpu_map = process_cpu_map_event; + perf_stat.tool.stat_config = process_stat_config_event; + perf_stat.tool.stat = perf_event__process_stat_event; + perf_stat.tool.stat_round = process_stat_round_event; + session = perf_session__new(&perf_stat.data, &perf_stat.tool); if (IS_ERR(session)) return PTR_ERR(session); -- cgit v1.2.3 From d48940cabcf70c186901f2c75fe89fed12d4ec18 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:09 -0700 Subject: perf annotate: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-18-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index cc65e6f8f4da..d6f6ba5a569d 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -696,28 +696,7 @@ static const char * const annotate_usage[] = { int cmd_annotate(int argc, const char **argv) { - struct perf_annotate annotate = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .namespaces = perf_event__process_namespaces, - .attr = perf_event__process_attr, - .build_id = perf_event__process_build_id, -#ifdef HAVE_LIBTRACEEVENT - .tracing_data = perf_event__process_tracing_data, -#endif - .id_index = perf_event__process_id_index, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .feature = process_feature_event, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, - }; + struct perf_annotate annotate = {}; struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; @@ -876,6 +855,25 @@ int cmd_annotate(int argc, const char **argv) data.path = input_name; + perf_tool__init(&annotate.tool, /*ordered_events=*/true); + annotate.tool.sample = process_sample_event; + annotate.tool.mmap = perf_event__process_mmap; + annotate.tool.mmap2 = perf_event__process_mmap2; + annotate.tool.comm = perf_event__process_comm; + annotate.tool.exit = perf_event__process_exit; + annotate.tool.fork = perf_event__process_fork; + annotate.tool.namespaces = perf_event__process_namespaces; + annotate.tool.attr = perf_event__process_attr; + annotate.tool.build_id = perf_event__process_build_id; +#ifdef HAVE_LIBTRACEEVENT + annotate.tool.tracing_data = perf_event__process_tracing_data; +#endif + annotate.tool.id_index = perf_event__process_id_index; + annotate.tool.auxtrace_info = perf_event__process_auxtrace_info; + annotate.tool.auxtrace = perf_event__process_auxtrace; + annotate.tool.feature = process_feature_event; + annotate.tool.ordering_requires_timestamps = true; + annotate.session = perf_session__new(&data, &annotate.tool); if (IS_ERR(annotate.session)) return PTR_ERR(annotate.session); -- cgit v1.2.3 From 41860d49473c0c09dc0a2a4d148047f97aaa2539 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:10 -0700 Subject: perf sched: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-19-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 2c60bd3a8149..34fe8e540c43 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3100,7 +3100,6 @@ static int perf_sched__timehist(struct perf_sched *sched) sched->tool.tracing_data = perf_event__process_tracing_data; sched->tool.build_id = perf_event__process_build_id; - sched->tool.ordered_events = true; sched->tool.ordering_requires_timestamps = true; symbol_conf.use_callchain = sched->show_callchain; @@ -3605,14 +3604,6 @@ int cmd_sched(int argc, const char **argv) { static const char default_sort_order[] = "avg, max, switch, runtime"; struct perf_sched sched = { - .tool = { - .sample = perf_sched__process_tracepoint_sample, - .comm = perf_sched__process_comm, - .namespaces = perf_event__process_namespaces, - .lost = perf_event__process_lost, - .fork = perf_sched__process_fork_event, - .ordered_events = true, - }, .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid), .sort_list = LIST_HEAD_INIT(sched.sort_list), .sort_order = default_sort_order, @@ -3733,6 +3724,13 @@ int cmd_sched(int argc, const char **argv) }; int ret; + perf_tool__init(&sched.tool, /*ordered_events=*/true); + sched.tool.sample = perf_sched__process_tracepoint_sample; + sched.tool.comm = perf_sched__process_comm; + sched.tool.namespaces = perf_event__process_namespaces; + sched.tool.lost = perf_event__process_lost; + sched.tool.fork = perf_sched__process_fork_event; + argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands, sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (!argc) -- cgit v1.2.3 From 4a20562bc4118478b4592bab8f0d90975df77095 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:11 -0700 Subject: perf mem: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-20-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 0fb4d75ab959..18e5f9a0ddc2 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -276,7 +276,23 @@ static int report_raw_events(struct perf_mem *mem) .force = mem->force, }; int ret; - struct perf_session *session = perf_session__new(&data, &mem->tool); + struct perf_session *session; + + perf_tool__init(&mem->tool, /*ordered_events=*/true); + mem->tool.sample = process_sample_event; + mem->tool.mmap = perf_event__process_mmap; + mem->tool.mmap2 = perf_event__process_mmap2; + mem->tool.comm = perf_event__process_comm; + mem->tool.lost = perf_event__process_lost; + mem->tool.fork = perf_event__process_fork; + mem->tool.attr = perf_event__process_attr; + mem->tool.build_id = perf_event__process_build_id; + mem->tool.namespaces = perf_event__process_namespaces; + mem->tool.auxtrace_info = perf_event__process_auxtrace_info; + mem->tool.auxtrace = perf_event__process_auxtrace; + mem->tool.auxtrace_error = perf_event__process_auxtrace_error; + + session = perf_session__new(&data, &mem->tool); if (IS_ERR(session)) return PTR_ERR(session); @@ -458,21 +474,6 @@ int cmd_mem(int argc, const char **argv) { struct stat st; struct perf_mem mem = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .lost = perf_event__process_lost, - .fork = perf_event__process_fork, - .attr = perf_event__process_attr, - .build_id = perf_event__process_build_id, - .namespaces = perf_event__process_namespaces, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .auxtrace_error = perf_event__process_auxtrace_error, - .ordered_events = true, - }, .input_name = "perf.data", /* * default to both load an store sampling -- cgit v1.2.3 From 60b5fd3f62d0622b70450d4707dc30be47a724c7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:12 -0700 Subject: perf timechart: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-21-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-timechart.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 5bf818baa662..218c8b44d7be 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -1606,10 +1606,16 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name) .mode = PERF_DATA_MODE_READ, .force = tchart->force, }; - - struct perf_session *session = perf_session__new(&data, &tchart->tool); + struct perf_session *session; int ret = -EINVAL; + perf_tool__init(&tchart->tool, /*ordered_events=*/true); + tchart->tool.comm = process_comm_event; + tchart->tool.fork = process_fork_event; + tchart->tool.exit = process_exit_event; + tchart->tool.sample = process_sample_event; + + session = perf_session__new(&data, &tchart->tool); if (IS_ERR(session)) return PTR_ERR(session); @@ -1924,13 +1930,6 @@ parse_time(const struct option *opt, const char *arg, int __maybe_unused unset) int cmd_timechart(int argc, const char **argv) { struct timechart tchart = { - .tool = { - .comm = process_comm_event, - .fork = process_fork_event, - .exit = process_exit_event, - .sample = process_sample_event, - .ordered_events = true, - }, .proc_num = 15, .min_time = NSEC_PER_MSEC, .merge_dist = 1000, -- cgit v1.2.3 From 1e1ec8f2e5fa914da0155170c63099d7189f3cfc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:13 -0700 Subject: perf diff: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-22-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-diff.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 4c0567882a7a..28c5208fcdc9 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -467,21 +467,7 @@ out: return ret; } -static struct perf_diff pdiff = { - .tool = { - .sample = diff__process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, - .namespaces = perf_event__process_namespaces, - .cgroup = perf_event__process_cgroup, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, -}; +static struct perf_diff pdiff; static struct evsel *evsel_match(struct evsel *evsel, struct evlist *evlist) @@ -1959,6 +1945,18 @@ int cmd_diff(int argc, const char **argv) if (ret < 0) return ret; + perf_tool__init(&pdiff.tool, /*ordered_events=*/true); + pdiff.tool.sample = diff__process_sample_event; + pdiff.tool.mmap = perf_event__process_mmap; + pdiff.tool.mmap2 = perf_event__process_mmap2; + pdiff.tool.comm = perf_event__process_comm; + pdiff.tool.exit = perf_event__process_exit; + pdiff.tool.fork = perf_event__process_fork; + pdiff.tool.lost = perf_event__process_lost; + pdiff.tool.namespaces = perf_event__process_namespaces; + pdiff.tool.cgroup = perf_event__process_cgroup; + pdiff.tool.ordering_requires_timestamps = true; + perf_config(diff__config, NULL); argc = parse_options(argc, argv, options, diff_usage, 0); -- cgit v1.2.3 From b9d276d1a282dfca186d0dbc2a001581b7c6f5b7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:14 -0700 Subject: perf data convert json: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-23-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data-convert-json.c | 43 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 905ea9823f9d..20bfb0884e9e 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -316,39 +316,36 @@ int bt_convert__perf2json(const char *input_name, const char *output_name, struct perf_session *session; int fd; int ret = -1; - struct convert_json c = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .cgroup = perf_event__process_cgroup, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, -#ifdef HAVE_LIBTRACEEVENT - .tracing_data = perf_event__process_tracing_data, -#endif - .build_id = perf_event__process_build_id, - .id_index = perf_event__process_id_index, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .event_update = perf_event__process_event_update, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, .first = true, .events_count = 0, }; - struct perf_data data = { .mode = PERF_DATA_MODE_READ, .path = input_name, .force = opts->force, }; + perf_tool__init(&c.tool, /*ordered_events=*/true); + c.tool.sample = process_sample_event; + c.tool.mmap = perf_event__process_mmap; + c.tool.mmap2 = perf_event__process_mmap2; + c.tool.comm = perf_event__process_comm; + c.tool.namespaces = perf_event__process_namespaces; + c.tool.cgroup = perf_event__process_cgroup; + c.tool.exit = perf_event__process_exit; + c.tool.fork = perf_event__process_fork; + c.tool.lost = perf_event__process_lost; +#ifdef HAVE_LIBTRACEEVENT + c.tool.tracing_data = perf_event__process_tracing_data; +#endif + c.tool.build_id = perf_event__process_build_id; + c.tool.id_index = perf_event__process_id_index; + c.tool.auxtrace_info = perf_event__process_auxtrace_info; + c.tool.auxtrace = perf_event__process_auxtrace; + c.tool.event_update = perf_event__process_event_update; + c.tool.ordering_requires_timestamps = true; + if (opts->all) { pr_err("--all is currently unsupported for JSON output.\n"); goto err; -- cgit v1.2.3 From 2721c6cc04d79057b855411a16964965d5a3da0a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:15 -0700 Subject: perf data convert ctf: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-24-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data-convert-bt.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 9e2170604b66..021e9b1d5cc5 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1607,25 +1607,23 @@ int bt_convert__perf2ctf(const char *input, const char *path, .mode = PERF_DATA_MODE_READ, .force = opts->force, }; - struct convert c = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, - .tracing_data = perf_event__process_tracing_data, - .build_id = perf_event__process_build_id, - .namespaces = perf_event__process_namespaces, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, - }; + struct convert c = {}; struct ctf_writer *cw = &c.writer; int err; + perf_tool__init(&c.tool, /*ordered_events=*/true); + c.tool.sample = process_sample_event; + c.tool.mmap = perf_event__process_mmap; + c.tool.mmap2 = perf_event__process_mmap2; + c.tool.comm = perf_event__process_comm; + c.tool.exit = perf_event__process_exit; + c.tool.fork = perf_event__process_fork; + c.tool.lost = perf_event__process_lost; + c.tool.tracing_data = perf_event__process_tracing_data; + c.tool.build_id = perf_event__process_build_id; + c.tool.namespaces = perf_event__process_namespaces; + c.tool.ordering_requires_timestamps = true; + if (opts->all) { c.tool.comm = process_comm_event; c.tool.exit = process_exit_event; -- cgit v1.2.3 From 332b897f34b95fea32c998ceca0569ac2c049d3d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:16 -0700 Subject: perf test event_update: Ensure tool is initialized Ensure tool is initialized to avoid the lazy initialization pattern so that more uses of struct perf_tool can be made const.
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-25-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/event_update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index fdecad920f59..d6b4ce3ef4ee 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -103,6 +103,7 @@ static int test__event_update(struct test_suite *test __maybe_unused, int subtes TEST_ASSERT_VAL("failed to synthesize attr update scale", !perf_event__synthesize_event_update_scale(NULL, evsel, process_event_scale)); + perf_tool__init(&tmp.tool, /*ordered_events=*/false); tmp.name = evsel__name(evsel); TEST_ASSERT_VAL("failed to synthesize attr update name", -- cgit v1.2.3 From fcd00f3e3b375ee87caf5be587840441ef1b4243 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:17 -0700 Subject: perf kwork: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-26-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kwork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 8ffaa80a2d1d..6a4281b8fd10 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -2322,12 +2322,6 @@ int cmd_kwork(int argc, const char **argv) { static struct perf_kwork kwork = { .class_list = LIST_HEAD_INIT(kwork.class_list), - .tool = { - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .sample = perf_kwork__process_tracepoint_sample, - .ordered_events = true, - }, .atom_page_list = LIST_HEAD_INIT(kwork.atom_page_list), .sort_list = LIST_HEAD_INIT(kwork.sort_list), .cmp_id = LIST_HEAD_INIT(kwork.cmp_id), @@ -2462,6 +2456,11 @@ int cmd_kwork(int argc, const char **argv) "record", "report", "latency", "timehist", "top", NULL }; + perf_tool__init(&kwork.tool, /*ordered_events=*/true); + kwork.tool.mmap = perf_event__process_mmap; + kwork.tool.mmap2 = perf_event__process_mmap2; + kwork.tool.sample = perf_kwork__process_tracepoint_sample; + argc = parse_options_subcommand(argc, argv, kwork_options, kwork_subcommands, kwork_usage, PARSE_OPT_STOP_AT_NON_OPTION); -- cgit v1.2.3 From 15d4a6f41d7275799d318b198f479c9783d56f78 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: 
Mon, 12 Aug 2024 13:47:18 -0700 Subject: perf tool: Remove perf_tool__fill_defaults() Now all tools are fully initialized prior to use it has no use so remove. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-27-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 6 ---- tools/perf/util/tool.c | 89 ----------------------------------------------- tools/perf/util/tool.h | 1 - 3 files changed, 96 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5c0e0341d39e..a8404cc99cfc 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1781,8 +1781,6 @@ static int __perf_session__process_pipe_events(struct perf_session *session) void *p; bool update_prog = false; - perf_tool__fill_defaults(tool); - /* * If it's from a file saving pipe data (by redirection), it would have * a file name other than "-". Then we can get the total size and show @@ -2206,8 +2204,6 @@ static int __perf_session__process_events(struct perf_session *session) struct ui_progress prog; int err; - perf_tool__fill_defaults(tool); - if (rd.data_size == 0) return -1; @@ -2260,8 +2256,6 @@ static int __perf_session__process_dir_events(struct perf_session *session) u64 total_size = perf_data__size(session->data); struct reader *rd; - perf_tool__fill_defaults(tool); - ui_progress__init_size(&prog, total_size, "Processing events..."); nr_readers = 1; diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c index a4ca16e5eefe..3b7f390f26eb 100644 --- a/tools/perf/util/tool.c +++ b/tools/perf/util/tool.c @@ -288,95 +288,6 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events) tool->finished_init = process_event_op2_stub; } -void perf_tool__fill_defaults(struct perf_tool *tool) -{ - if (tool->sample == NULL) - tool->sample = process_event_sample_stub; - if (tool->mmap == NULL) - tool->mmap = process_event_stub; - if (tool->mmap2 == NULL) - tool->mmap2 = process_event_stub; - if (tool->comm == NULL) - tool->comm = process_event_stub; - if (tool->namespaces == NULL) - tool->namespaces = process_event_stub; - if (tool->cgroup == NULL) - tool->cgroup = process_event_stub; - if (tool->fork == NULL) - tool->fork = process_event_stub; - if (tool->exit == NULL) - tool->exit = process_event_stub; - if (tool->lost == NULL) - tool->lost = perf_event__process_lost; - if (tool->lost_samples == NULL) - tool->lost_samples = perf_event__process_lost_samples; - if (tool->aux == NULL) - tool->aux = perf_event__process_aux; - if (tool->itrace_start == NULL) - tool->itrace_start = perf_event__process_itrace_start; - if (tool->context_switch == NULL) - tool->context_switch = perf_event__process_switch; - if (tool->ksymbol == NULL) - tool->ksymbol = perf_event__process_ksymbol; - if (tool->bpf == NULL) - tool->bpf = perf_event__process_bpf; - if (tool->text_poke == NULL) - tool->text_poke = perf_event__process_text_poke; - if (tool->aux_output_hw_id == NULL) - tool->aux_output_hw_id = perf_event__process_aux_output_hw_id; - if 
(tool->read == NULL) - tool->read = process_event_sample_stub; - if (tool->throttle == NULL) - tool->throttle = process_event_stub; - if (tool->unthrottle == NULL) - tool->unthrottle = process_event_stub; - if (tool->attr == NULL) - tool->attr = process_event_synth_attr_stub; - if (tool->event_update == NULL) - tool->event_update = process_event_synth_event_update_stub; - if (tool->tracing_data == NULL) - tool->tracing_data = process_event_synth_tracing_data_stub; - if (tool->build_id == NULL) - tool->build_id = process_event_op2_stub; - if (tool->finished_round == NULL) { - if (tool->ordered_events) - tool->finished_round = perf_event__process_finished_round; - else - tool->finished_round = process_finished_round_stub; - } - if (tool->id_index == NULL) - tool->id_index = process_event_op2_stub; - if (tool->auxtrace_info == NULL) - tool->auxtrace_info = process_event_op2_stub; - if (tool->auxtrace == NULL) - tool->auxtrace = process_event_auxtrace_stub; - if (tool->auxtrace_error == NULL) - tool->auxtrace_error = process_event_op2_stub; - if (tool->thread_map == NULL) - tool->thread_map = process_event_thread_map_stub; - if (tool->cpu_map == NULL) - tool->cpu_map = process_event_cpu_map_stub; - if (tool->stat_config == NULL) - tool->stat_config = process_event_stat_config_stub; - if (tool->stat == NULL) - tool->stat = process_stat_stub; - if (tool->stat_round == NULL) - tool->stat_round = process_stat_round_stub; - if (tool->time_conv == NULL) - tool->time_conv = process_event_time_conv_stub; - if (tool->feature == NULL) - tool->feature = process_event_op2_stub; - if (tool->compressed == NULL) { -#ifdef HAVE_ZSTD_SUPPORT - tool->compressed = perf_session__process_compressed_event; -#else - tool->compressed = perf_session__process_compressed_event_stub; -#endif - } - if (tool->finished_init == NULL) - tool->finished_init = process_event_op2_stub; -} - bool perf_tool__compressed_is_stub(const struct perf_tool *tool) { return tool->compressed == perf_session__process_compressed_event_stub; diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index c7fea58729a5..db1c7642b0d1 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -90,7 +90,6 @@ struct perf_tool { }; void perf_tool__init(struct perf_tool *tool, bool ordered_events); -void perf_tool__fill_defaults(struct perf_tool *tool); bool perf_tool__compressed_is_stub(const struct perf_tool *tool); -- cgit v1.2.3 From 8f29be326da202963e3e6d33550d375313009f68 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:19 -0700 Subject: perf session: Constify tool Make tool const now that all uses are const and perf_tool__fill_defaults() won't be used. The aim is to better capture that sessions don't mutate tools. 
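With the lazy fill-in gone, nothing in the session paths writes through the tool pointer, and the const qualifier lets the compiler enforce that. As an illustrative sketch (deliver_event is a hypothetical name; the signature matches the sample callbacks used elsewhere in this series):

	static int deliver_event(const struct perf_tool *tool, union perf_event *event,
				 struct perf_sample *sample, struct evsel *evsel,
				 struct machine *machine)
	{
		/* Callbacks are read through the const pointer, never reassigned. */
		return tool->sample(tool, event, sample, evsel, machine);
	}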
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-28-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 6 +++--- tools/perf/util/session.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a8404cc99cfc..d2bd563119bc 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1770,7 +1770,7 @@ static int __perf_session__process_decomp_events(struct perf_session *session); static int __perf_session__process_pipe_events(struct perf_session *session) { struct ordered_events *oe = &session->ordered_events; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct ui_progress prog; union perf_event *event; uint32_t size, cur_size = 0; @@ -2200,7 +2200,7 @@ static int __perf_session__process_events(struct perf_session *session) .in_place_update = session->data->in_place_update, }; struct ordered_events *oe = &session->ordered_events; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct ui_progress prog; int err; @@ -2250,7 +2250,7 @@ out_err: static int __perf_session__process_dir_events(struct perf_session *session) { struct perf_data *data = session->data; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; int i, ret, readers, nr_readers; struct ui_progress prog; u64 total_size = perf_data__size(session->data); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 7f69baeae7fb..7c8dd6956330 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -43,7 +43,7 @@ struct perf_session { u64 one_mmap_offset; struct ordered_events ordered_events; struct perf_data *data; - struct perf_tool *tool; + const struct perf_tool *tool; u64 bytes_transferred; u64 bytes_compressed; struct zstd_data zstd_data; -- cgit v1.2.3 From 807746b9bdc2acc9b18130d0a7ff0be1626ccc76 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 20 Jul 2024 02:20:54 -0400 Subject: perf parse-events: Add a retirement latency modifier Retirement latency is a separate sampled count used on newer Intel CPUs. 
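As with the existing event modifiers, 'R' is appended to the event string after a colon and, per the lexer change, can be combined with the other letters accepted by modifier_event. A hypothetical invocation (the event name is a placeholder; in practice this applies to Intel events whose retirement latency can be sampled) would look like 'perf stat -e "some_event:R" -- workload'. A later patch in this series rewrites such an event to 'some_event:p' and samples it with a forked 'perf record' to obtain the latency values.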
Reviewed-by: Namhyung Kim Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-2-weilin.wang@intel.com Signed-off-by: Weilin Wang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.h | 1 + tools/perf/util/parse-events.c | 2 ++ tools/perf/util/parse-events.h | 1 + tools/perf/util/parse-events.l | 3 ++- 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 80b5f6dd868e..14f777b9e03e 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -98,6 +98,7 @@ struct evsel { bool bpf_counter; bool use_config_name; bool skippable; + bool retire_lat; int bpf_fd; struct bpf_object *bpf_obj; struct list_head config_terms; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 321586fb5556..fab01ba54e34 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1811,6 +1811,8 @@ static int parse_events__modifier_list(struct parse_events_state *parse_state, evsel->weak_group = true; if (mod.bpf) evsel->bpf_counter = true; + if (mod.retire_lat) + evsel->retire_lat = true; } return 0; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index e13de2c8b706..b735cd9e0acf 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -203,6 +203,7 @@ struct parse_events_modifier { bool hypervisor : 1; /* 'h' */ bool guest : 1; /* 'G' */ bool host : 1; /* 'H' */ + bool retire_lat : 1; /* 'R' */ }; int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 16045c383ada..5a0bcd7f166a 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -209,6 +209,7 @@ static int modifiers(struct parse_events_state *parse_state, yyscan_t scanner) CASE('W', weak); CASE('e', exclusive); CASE('b', bpf); + CASE('R', retire_lat); default: return PE_ERROR; } @@ -250,7 +251,7 @@ drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? * If you add a modifier you need to update check_modifier(). * Also, the letters in modifier_event must not be in modifier_bp. */ -modifier_event [ukhpPGHSDIWeb]{1,15} +modifier_event [ukhpPGHSDIWebR]{1,16} modifier_bp [rwx]{1,3} lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node) lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss) -- cgit v1.2.3 From a9a4ca5767c1756b1bc5bd3397b515af2cbc2c82 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:20:55 -0400 Subject: perf data: Allow to use given fd in data->file.fd When in PIPE mode, allow to use an fd dynamically opened and assigned to data->file.fd instead of STDIN_FILENO or STDOUT_FILENO.
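This lets a caller open a pipe end itself and hand it to the perf_data layer before the session is created. The TPEBS support added later in this series relies on exactly that; a trimmed sketch of its reader side, where child->out is the read end of the pipe connected to a forked 'perf record':

	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = "-",		/* "-" selects pipe mode */
		.file.fd = child->out,	/* pre-opened fd used instead of STDIN_FILENO */
	};

	session = perf_session__new(&data, &tool);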
Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-3-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 08c4bfbd817f..98661ede2a73 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -204,7 +204,12 @@ static bool check_pipe(struct perf_data *data) data->file.fd = fd; data->use_stdio = false; } - } else { + + /* + * When is_pipe and data->file.fd is given, use given fd + * instead of STDIN_FILENO or STDOUT_FILENO + */ + } else if (data->file.fd <= 0) { data->file.fd = fd; } } -- cgit v1.2.3 From 110d3ffe9d2b42c04faaf92c3fc627004e3328c6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 7 Aug 2024 16:13:48 +0200 Subject: selftests: router_mpath: Sleep after MZ In the context of an offloaded datapath, it may take a while for the ip link stats to be updated. This causes the test to fail when MZ_DELAY is too low. Sleep after the packets are sent for the link stats to get up to date. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/8b1971d948273afd7de2da3d6a2ba35200540e55.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/router_mpath_nh.sh | 2 ++ tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh | 2 ++ tools/testing/selftests/net/forwarding/router_multipath.sh | 2 ++ 3 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index 2ba44247c60a..c5a30f8f55b5 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -243,6 +243,7 @@ multipath4_test() ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -276,6 +277,7 @@ multipath6_test() $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index cd9e346436fc..bd35fe8be9aa 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -244,6 +244,7 @@ multipath4_test() ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -274,6 +275,7 @@ multipath6_l4_test() $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh index e2be354167a1..46f365b557b7 100755 --- 
a/tools/testing/selftests/net/forwarding/router_multipath.sh +++ b/tools/testing/selftests/net/forwarding/router_multipath.sh @@ -180,6 +180,7 @@ multipath4_test() ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -217,6 +218,7 @@ multipath6_test() $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) -- cgit v1.2.3 From bb89fdacf99ccbec4dfe2d04f76f870f1483d3ea Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 7 Aug 2024 16:13:49 +0200 Subject: selftests: router_mpath_nh: Test 16-bit next hop weights Add tests that exercise full 16 bits of NH weight. To test the 255:65535, it is necessary to run more packets than for the other tests. On a debug kernel, the test can take up to a minute, therefore avoid the test when KSFT_MACHINE_SLOW. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/c0c257c00ad30b07afc3fa5e2afd135925405544.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 7 ++++ .../selftests/net/forwarding/router_mpath_nh.sh | 38 +++++++++++++++++----- .../net/forwarding/router_mpath_nh_lib.sh | 13 ++++++++ 3 files changed, 50 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index ff96bb7535ff..cb0fcd6f0293 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -509,6 +509,13 @@ xfail_on_slow() fi } +omit_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW != yes ]]; then + "$@" + fi +} + xfail_on_veth() { local dev=$1; shift diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index c5a30f8f55b5..a7d8399c8d4f 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -40,6 +40,7 @@ ALL_TESTS=" ping_ipv4 ping_ipv6 multipath_test + multipath16_test ping_ipv4_blackhole ping_ipv6_blackhole nh_stats_test_v4 @@ -226,9 +227,11 @@ routing_nh_obj() multipath4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -242,7 +245,7 @@ multipath4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) @@ -259,9 +262,11 @@ multipath4_test() multipath6_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -276,7 +281,7 @@ multipath6_test() t0_rp13=$(link_stats_tx_packets_get $rp13) $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) @@ 
-315,6 +320,23 @@ multipath_test() multipath6_test "Weighted MP 11:45" 11 45 } +multipath16_test() +{ + check_nhgw16 104 || return + + log_info "Running 16-bit IPv4 multipath tests" + multipath4_test "65535:65535" 65535 65535 + multipath4_test "128:512" 128 512 + omit_on_slow \ + multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + log_info "Running 16-bit IPv6 multipath tests" + multipath6_test "65535:65535" 65535 65535 + multipath6_test "128:512" 128 512 + omit_on_slow \ + multipath6_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 +} + ping_ipv4_blackhole() { RET=0 diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh index 2903294d8bca..507b2852dabe 100644 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh @@ -117,3 +117,16 @@ __nh_stats_test_v6() $MZ -6 $h1 -A 2001:db8:1::2 -B 2001:db8:2::2 sysctl_restore net.ipv6.fib_multipath_hash_policy } + +check_nhgw16() +{ + local nhid=$1; shift + + ip nexthop replace id 9999 group "$nhid,65535" &>/dev/null + if (( $? )); then + log_test_skip "16-bit multipath tests" \ + "iproute2 or the kernel do not support 16-bit next hop weights" + return 1 + fi + ip nexthop del id 9999 ||: +} -- cgit v1.2.3 From dce0765c1d5bf810bf3cd8e66cf7f6e73d14d7e2 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 7 Aug 2024 16:13:50 +0200 Subject: selftests: router_mpath_nh_res: Test 16-bit next hop weights Add tests that exercise full 16 bits of NH weight. Like in the previous patch, omit the 255:65535 test when KSFT_MACHINE_SLOW. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/a91d6ead9d1b1b4b7e276ca58a71ef814f42b7dd.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/forwarding/router_mpath_nh_res.sh | 56 ++++++++++++++++++---- 1 file changed, 48 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index bd35fe8be9aa..88ddae05b39d 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -40,6 +40,7 @@ ALL_TESTS=" ping_ipv4 ping_ipv6 multipath_test + multipath16_test nh_stats_test_v4 nh_stats_test_v6 " @@ -228,9 +229,11 @@ routing_nh_obj() multipath4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -243,7 +246,7 @@ multipath4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) @@ -259,9 +262,11 @@ multipath4_test() multipath6_l4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -274,7 +279,7 @@ multipath6_l4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ - -d $MZ_DELAY -t udp 
"sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) @@ -373,6 +378,41 @@ multipath_test() ip nexthop replace id 106 group 104,1/105,1 type resilient } +multipath16_test() +{ + check_nhgw16 104 || return + + log_info "Running 16-bit IPv4 multipath tests" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 0 + + ip nexthop replace id 103 group 101,65535/102,65535 type resilient + multipath4_test "65535:65535" 65535 65535 + + ip nexthop replace id 103 group 101,128/102,512 type resilient + multipath4_test "128:512" 128 512 + + ip nexthop replace id 103 group 101,255/102,65535 type resilient + omit_on_slow \ + multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running 16-bit IPv6 L4 hash multipath tests" + ip nexthop replace id 106 group 104/105 type resilient idle_timer 0 + + ip nexthop replace id 106 group 104,65535/105,65535 type resilient + multipath6_l4_test "65535:65535" 65535 65535 + + ip nexthop replace id 106 group 104,128/105,512 type resilient + multipath6_l4_test "128:512" 128 512 + + ip nexthop replace id 106 group 104,255/105,65535 type resilient + omit_on_slow \ + multipath6_l4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + ip nexthop replace id 106 group 104,1/105,1 type resilient +} + nh_stats_test_v4() { __nh_stats_test_v4 resilient -- cgit v1.2.3 From 4b808f44733243f981ae04046ef93a65ecc631a9 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 7 Aug 2024 16:13:51 +0200 Subject: selftests: fib_nexthops: Test 16-bit next hop weights Add tests that attempt to create NH groups that use full 16 bits of NH weight. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/101cdd3f2bfd9511c9bec95f909d20ff56f70ba5.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthops.sh | 55 ++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index ac0b2c6a5761..77c83d9508d3 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -78,7 +78,12 @@ log_test() else ret=1 nfail=$((nfail+1)) - printf "TEST: %-60s [FAIL]\n" "${msg}" + if [[ $rc -eq $ksft_skip ]]; then + printf "TEST: %-60s [SKIP]\n" "${msg}" + else + printf "TEST: %-60s [FAIL]\n" "${msg}" + fi + if [ "$VERBOSE" = "1" ]; then echo " rc=$rc, expected $expected" fi @@ -923,6 +928,29 @@ ipv6_grp_fcnal() ipv6_grp_refs log_test $? 0 "Nexthop group replace refcounts" + + # + # 16-bit weights. + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1" + run_cmd "$IP nexthop add id 66 dev veth1" + + run_cmd "$IP nexthop add id 103 group 62,1000" + if [[ $? == 0 ]]; then + local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535" + run_cmd "$IP nexthop replace $GRP" + check_nexthop "id 103" "$GRP" + rc=$? 
+ else + rc=$ksft_skip + fi + + $IP nexthop flush >/dev/null 2>&1 + + log_test $rc 0 "16-bit weights" } ipv6_res_grp_fcnal() @@ -987,6 +1015,31 @@ ipv6_res_grp_fcnal() check_nexthop_bucket "list id 102" \ "id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62" log_test $? 0 "Nexthop buckets updated after replace - nECMP" + + # + # 16-bit weights. + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1" + run_cmd "$IP nexthop add id 66 dev veth1" + + run_cmd "$IP nexthop add id 103 group 62,1000 type resilient buckets 32" + if [[ $? == 0 ]]; then + local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535 $(: + )type resilient buckets 32 idle_timer 0 $(: + )unbalanced_timer 0" + run_cmd "$IP nexthop replace $GRP" + check_nexthop "id 103" "$GRP unbalanced_time 0" + rc=$? + else + rc=$ksft_skip + fi + + $IP nexthop flush >/dev/null 2>&1 + + log_test $rc 0 "16-bit weights" } ipv6_fcnal_runtime() -- cgit v1.2.3 From 8db5cabcf1b6f9d62bd024aa6293de715d75518f Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:20:56 -0400 Subject: perf stat: Fork and launch 'perf record' when 'perf stat' needs to get retire latency value for a metric. When retire_latency value is used in a metric formula, evsel would fork a 'perf record' process with "-e" and "-W" options. 'perf record' will collect required retire_latency values in parallel while 'perf stat' is collecting counting values. At the point of time that 'perf stat' stops counting, evsel would stop 'perf record' by sending sigterm signal to 'perf record' process. Sampled data will be processed to get retire latency value. Another thread is required to synchronize between 'perf stat' and 'perf record' when we pass data through pipe. Retire_latency evsel is not opened for 'perf stat' so that there is no counter wasted on it. This commit includes code suggested by Namhyung to adjust reading size for groups that include retire_latency evsels. In current :R parsing implementation, the parser would recognize events with retire_latency modifier and insert them into the evlist like a normal event. Ideally, we need to avoid counting these events. In this commit, at the time when a retire_latency evsel is read, set the retire latency value processed from the sampled data to count value. This sampled retire latency value will be used for metric calculation and final event count print out. No special metric calculation and event print out code required for retire_latency events. 
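Concretely, the forked sampler is an ordinary 'perf record' invocation whose argument vector is assembled at run time. A sketch of what it ends up looking like (the fd numbers and cpu list are illustrative; the real code fills them in from the --control pipes and the evlist cpumap, and appends one -e per retire_latency event):

	const char *record_argv[] = {
		"perf", "record",
		"-W",			/* record sample weights, which carry retire_lat */
		"--synth=no",
		"--control=fd:3,4",	/* control/ack fds for the start/stop handshake */
		"-C", "0-7",		/* only added when a cpu list was requested */
		"-e", "some_event:p",	/* the :R event, rewritten with 'p' for sampling */
		"-o", "-",		/* stream perf.data to the parent over a pipe */
		NULL,
	};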
Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-4-weilin.wang@intel.com [ Squashed the 3rd and 4th commit in the series to keep it building patch by patch ] [ Constified the 'struct perf_tool' pointer in process_sample_event() ] [ Use perf_tool__init(&tool, false) to address a segfault I reported and Ian/Weilin diagnosed ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/evlist.c | 6 + tools/perf/builtin-stat.c | 4 + tools/perf/util/Build | 1 + tools/perf/util/evlist.c | 2 + tools/perf/util/evsel.c | 81 ++++++- tools/perf/util/evsel.h | 5 + tools/perf/util/intel-tpebs.c | 432 ++++++++++++++++++++++++++++++++++++++ tools/perf/util/intel-tpebs.h | 35 +++ 8 files changed, 564 insertions(+), 2 deletions(-) create mode 100644 tools/perf/util/intel-tpebs.c create mode 100644 tools/perf/util/intel-tpebs.h (limited to 'tools') diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c index b1ce0c52d88d..cebdd483149e 100644 --- a/tools/perf/arch/x86/util/evlist.c +++ b/tools/perf/arch/x86/util/evlist.c @@ -89,6 +89,12 @@ int arch_evlist__cmp(const struct evsel *lhs, const struct evsel *rhs) return 1; } + /* Retire latency event should not be group leader*/ + if (lhs->retire_lat && !rhs->retire_lat) + return 1; + if (!lhs->retire_lat && rhs->retire_lat) + return -1; + /* Default ordering by insertion index. */ return lhs->core.idx - rhs->core.idx; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 487eff9a8582..6162b98fc941 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -70,6 +70,7 @@ #include "util/bpf_counter.h" #include "util/iostat.h" #include "util/util.h" +#include "util/intel-tpebs.h" #include "asm/bug.h" #include @@ -683,6 +684,9 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) if (child_pid != -1) kill(child_pid, SIGTERM); + + tpebs_delete(); + return COUNTER_FATAL; } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index f6f18b4228bb..7ea261416c14 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -156,6 +156,7 @@ perf-util-y += clockid.o perf-util-y += list_sort.o perf-util-y += mutex.o perf-util-y += sharded_mutex.o +perf-util-$(CONFIG_X86_64) += intel-tpebs.o perf-util-$(CONFIG_LIBBPF) += bpf_map.o perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 1417f9a23083..4b2538e4f679 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -33,6 +33,7 @@ #include "util/bpf-filter.h" #include "util/stat.h" #include "util/util.h" +#include "util/intel-tpebs.h" #include #include #include @@ -179,6 +180,7 @@ void evlist__delete(struct evlist *evlist) if (evlist == NULL) return; + tpebs_delete(); evlist__free_stats(evlist); evlist__munmap(evlist); evlist__close(evlist); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index bc603193c477..d607056b73c9 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -59,6 +59,7 @@ #include #include #include +#include "util/intel-tpebs.h" #include @@ -1539,6 +1540,11 @@ static int evsel__read_one(struct evsel *evsel, int cpu_map_idx, int thread) return perf_evsel__read(&evsel->core, cpu_map_idx, thread, count); } +static int evsel__read_retire_lat(struct evsel *evsel, int 
cpu_map_idx, int thread) +{ + return tpebs_set_evsel(evsel, cpu_map_idx, thread); +} + static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread, u64 val, u64 ena, u64 run, u64 lost) { @@ -1546,6 +1552,12 @@ static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread, count = perf_counts(counter->counts, cpu_map_idx, thread); + if (counter->retire_lat) { + evsel__read_retire_lat(counter, cpu_map_idx, thread); + perf_counts__set_loaded(counter->counts, cpu_map_idx, thread, true); + return; + } + count->val = val; count->ena = ena; count->run = run; @@ -1554,6 +1566,60 @@ static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread, perf_counts__set_loaded(counter->counts, cpu_map_idx, thread, true); } +static bool evsel__group_has_tpebs(struct evsel *leader) +{ + struct evsel *evsel; + + for_each_group_evsel(evsel, leader) { + if (evsel__is_retire_lat(evsel)) + return true; + } + return false; +} + +static u64 evsel__group_read_nr_members(struct evsel *leader) +{ + u64 nr = leader->core.nr_members; + struct evsel *evsel; + + for_each_group_evsel(evsel, leader) { + if (evsel__is_retire_lat(evsel)) + nr--; + } + return nr; +} + +static u64 evsel__group_read_size(struct evsel *leader) +{ + u64 read_format = leader->core.attr.read_format; + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (!evsel__group_has_tpebs(leader)) + return perf_evsel__read_size(&leader->core); + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (read_format & PERF_FORMAT_LOST) + entry += sizeof(u64); + + if (read_format & PERF_FORMAT_GROUP) { + nr = evsel__group_read_nr_members(leader); + size += sizeof(u64); + } + + size += entry * nr; + return size; +} + static int evsel__process_group_data(struct evsel *leader, int cpu_map_idx, int thread, u64 *data) { u64 read_format = leader->core.attr.read_format; @@ -1562,7 +1628,7 @@ static int evsel__process_group_data(struct evsel *leader, int cpu_map_idx, int nr = *data++; - if (nr != (u64) leader->core.nr_members) + if (nr != evsel__group_read_nr_members(leader)) return -EINVAL; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -1592,7 +1658,7 @@ static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread) { struct perf_stat_evsel *ps = leader->stats; u64 read_format = leader->core.attr.read_format; - int size = perf_evsel__read_size(&leader->core); + int size = evsel__group_read_size(leader); u64 *data = ps->group_data; if (!(read_format & PERF_FORMAT_ID)) @@ -1784,6 +1850,9 @@ int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread) if (evsel__is_tool(evsel)) return evsel__read_tool(evsel, cpu_map_idx, thread); + if (evsel__is_retire_lat(evsel)) + return evsel__read_retire_lat(evsel, cpu_map_idx, thread); + if (evsel->core.attr.read_format & PERF_FORMAT_GROUP) return evsel__read_group(evsel, cpu_map_idx, thread); @@ -2200,6 +2269,9 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, return 0; } + if (evsel__is_retire_lat(evsel)) + return tpebs_start(evsel->evlist); + err = __evsel__prepare_open(evsel, cpus, threads); if (err) return err; @@ -2392,6 +2464,8 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, void evsel__close(struct evsel *evsel) { + if (evsel__is_retire_lat(evsel)) + tpebs_delete(); perf_evsel__close(&evsel->core); 
perf_evsel__free_id(&evsel->core); } @@ -3357,6 +3431,9 @@ static int store_evsel_ids(struct evsel *evsel, struct evlist *evlist) { int cpu_map_idx, thread; + if (evsel__is_retire_lat(evsel)) + return 0; + for (cpu_map_idx = 0; cpu_map_idx < xyarray__max_x(evsel->core.fd); cpu_map_idx++) { for (thread = 0; thread < xyarray__max_y(evsel->core.fd); thread++) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 14f777b9e03e..a5da4b03bb1c 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -311,6 +311,11 @@ static inline bool evsel__is_tool(const struct evsel *evsel) return evsel->tool_event != PERF_TOOL_NONE; } +static inline bool evsel__is_retire_lat(const struct evsel *evsel) +{ + return evsel->retire_lat; +} + const char *evsel__group_name(struct evsel *evsel); int evsel__group_desc(struct evsel *evsel, char *buf, size_t size); diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c new file mode 100644 index 000000000000..50a3c3e07160 --- /dev/null +++ b/tools/perf/util/intel-tpebs.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * intel_tpebs.c: Intel TPEBS support + */ + + +#include +#include +#include +#include "intel-tpebs.h" +#include +#include +#include +#include "sample.h" +#include "debug.h" +#include "evlist.h" +#include "evsel.h" +#include "session.h" +#include "tool.h" +#include "cpumap.h" +#include "metricgroup.h" +#include +#include +#include +#include + +#define PERF_DATA "-" + +bool tpebs_recording; +static pid_t tpebs_pid = -1; +static size_t tpebs_event_size; +static LIST_HEAD(tpebs_results); +static pthread_t tpebs_reader_thread; +static struct child_process *tpebs_cmd; + +struct tpebs_retire_lat { + struct list_head nd; + /* Event name */ + const char *name; + /* Event name with the TPEBS modifier R */ + const char *tpebs_name; + /* Count of retire_latency values found in sample data */ + size_t count; + /* Sum of all the retire_latency values in sample data */ + int sum; + /* Average of retire_latency, val = sum / count */ + double val; +}; + +static int get_perf_record_args(const char **record_argv, char buf[], + const char *cpumap_buf) +{ + struct tpebs_retire_lat *e; + int i = 0; + + pr_debug("tpebs: Prepare perf record for retire_latency\n"); + + record_argv[i++] = "perf"; + record_argv[i++] = "record"; + record_argv[i++] = "-W"; + record_argv[i++] = "--synth=no"; + record_argv[i++] = buf; + + if (!cpumap_buf) { + pr_err("tpebs: Require cpumap list to run sampling\n"); + return -ECANCELED; + } + /* Use -C when cpumap_buf is not "-1" */ + if (strcmp(cpumap_buf, "-1")) { + record_argv[i++] = "-C"; + record_argv[i++] = cpumap_buf; + } + + list_for_each_entry(e, &tpebs_results, nd) { + record_argv[i++] = "-e"; + record_argv[i++] = e->name; + } + + record_argv[i++] = "-o"; + record_argv[i++] = PERF_DATA; + + return 0; +} + +static int prepare_run_command(const char **argv) +{ + tpebs_cmd = zalloc(sizeof(struct child_process)); + if (!tpebs_cmd) + return -ENOMEM; + tpebs_cmd->argv = argv; + tpebs_cmd->out = -1; + return 0; +} + +static int start_perf_record(int control_fd[], int ack_fd[], + const char *cpumap_buf) +{ + const char **record_argv; + int ret; + char buf[32]; + + scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0], ack_fd[1]); + + record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *)); + if (!record_argv) + return -ENOMEM; + + ret = get_perf_record_args(record_argv, buf, cpumap_buf); + if (ret) + goto out; + + ret = prepare_run_command(record_argv); + if (ret) + 
goto out; + ret = start_command(tpebs_cmd); +out: + free(record_argv); + return ret; +} + +static int process_sample_event(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample, + struct evsel *evsel, + struct machine *machine __maybe_unused) +{ + int ret = 0; + const char *evname; + struct tpebs_retire_lat *t; + + evname = evsel__name(evsel); + + /* + * TODO: Handle per-core results. For now we assume the average retire + * latency value will be used. Save the number of samples and the sum of + * the retire latency values for each event. + */ + list_for_each_entry(t, &tpebs_results, nd) { + if (!strcmp(evname, t->name)) { + t->count += 1; + t->sum += sample->retire_lat; + t->val = (double) t->sum / t->count; + break; + } + } + + return ret; +} + +static int process_feature_event(struct perf_session *session, + union perf_event *event) +{ + if (event->feat.feat_id < HEADER_LAST_FEATURE) + return perf_event__process_feature(session, event); + return 0; +} + +static void *__sample_reader(void *arg) +{ + struct child_process *child = arg; + struct perf_session *session; + struct perf_data data = { + .mode = PERF_DATA_MODE_READ, + .path = PERF_DATA, + .file.fd = child->out, + }; + struct perf_tool tool; + + perf_tool__init(&tool, /*ordered_events=*/false); + tool.sample = process_sample_event; + tool.feature = process_feature_event; + tool.attr = perf_event__process_attr; + + session = perf_session__new(&data, &tool); + if (IS_ERR(session)) + return NULL; + perf_session__process_events(session); + perf_session__delete(session); + + return NULL; +} + +/* + * tpebs_stop - stop the sample data read thread and the perf record process. + */ +static int tpebs_stop(void) +{ + int ret = 0; + + /* Like tpebs_start(), we should only run tpebs_stop() once. */ + if (tpebs_pid != -1) { + kill(tpebs_cmd->pid, SIGTERM); + tpebs_pid = -1; + pthread_join(tpebs_reader_thread, NULL); + close(tpebs_cmd->out); + ret = finish_command(tpebs_cmd); + if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL) + ret = 0; + } + return ret; +} + +/* + * tpebs_start - start tpebs execution. + * @evsel_list: retire_latency evsels in this list will be selected and sampled + * to get the average retire_latency value. + * + * This function will be called at the evlist level later, once evlist__open() + * is called consistently. + */ +int tpebs_start(struct evlist *evsel_list) +{ + int ret = 0; + struct evsel *evsel; + char cpumap_buf[50]; + + /* + * We should only run tpebs_start when tpebs_recording is enabled. + * And we should only run it once with all the required events.
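+ * + * For each retire_latency evsel below, the 'R' modifier in the event name + * is rewritten to 'p' and the resulting event is handed to a forked + * 'perf record' for sampling; see get_perf_record_args().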
+ */ + if (tpebs_pid != -1 || !tpebs_recording) + return 0; + + cpu_map__snprint(evsel_list->core.user_requested_cpus, cpumap_buf, sizeof(cpumap_buf)); + /* + * Prepare perf record for sampling the retire_latency events before + * forking and preparing the workload + */ + evlist__for_each_entry(evsel_list, evsel) { + int i; + char *name; + struct tpebs_retire_lat *new; + + if (!evsel->retire_lat) + continue; + + pr_debug("tpebs: Retire_latency of event %s is required\n", evsel->name); + for (i = strlen(evsel->name) - 1; i > 0; i--) { + if (evsel->name[i] == 'R') + break; + } + if (i <= 0 || evsel->name[i] != 'R') { + ret = -1; + goto err; + } + + name = strdup(evsel->name); + if (!name) { + ret = -ENOMEM; + goto err; + } + name[i] = 'p'; + + new = zalloc(sizeof(*new)); + if (!new) { + ret = -1; + zfree(&name); + goto err; + } + new->name = name; + new->tpebs_name = evsel->name; + list_add_tail(&new->nd, &tpebs_results); + tpebs_event_size += 1; + } + + if (tpebs_event_size > 0) { + struct pollfd pollfd = { .events = POLLIN, }; + int control_fd[2], ack_fd[2], len; + char ack_buf[8]; + + /* Create control and ack fds for --control */ + if (pipe(control_fd) < 0) { + pr_err("tpebs: Failed to create control fifo\n"); + ret = -1; + goto out; + } + if (pipe(ack_fd) < 0) { + pr_err("tpebs: Failed to create ack fifo\n"); + ret = -1; + goto out; + } + + ret = start_perf_record(control_fd, ack_fd, cpumap_buf); + if (ret) + goto out; + tpebs_pid = tpebs_cmd->pid; + if (pthread_create(&tpebs_reader_thread, NULL, __sample_reader, tpebs_cmd)) { + kill(tpebs_cmd->pid, SIGTERM); + close(tpebs_cmd->out); + pr_err("Could not create thread to process sample data.\n"); + ret = -1; + goto out; + } + /* Wait for perf record initialization. */ + len = strlen(EVLIST_CTL_CMD_ENABLE_TAG); + ret = write(control_fd[1], EVLIST_CTL_CMD_ENABLE_TAG, len); + if (ret != len) { + pr_err("tpebs: failed to write control message to perf record\n"); + goto out; + } + + /* wait for an ack */ + pollfd.fd = ack_fd[0]; + + /* + * We need this poll to ensure the ack_fd PIPE will not hang + * when perf record failed for any reason. The timeout value + * 3000ms is an empirical selection. + */ + if (!poll(&pollfd, 1, 3000)) { + pr_err("tpebs failed: perf record ack timeout\n"); + ret = -1; + goto out; + } + + if (!(pollfd.revents & POLLIN)) { + pr_err("tpebs failed: did not receive an ack\n"); + ret = -1; + goto out; + } + + ret = read(ack_fd[0], ack_buf, sizeof(ack_buf)); + if (ret > 0) + ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG); + else { + pr_err("tpebs: perf record control ack failed\n"); + goto out; + } +out: + close(control_fd[0]); + close(control_fd[1]); + close(ack_fd[0]); + close(ack_fd[1]); + } +err: + if (ret) + tpebs_delete(); + return ret; +} + + +int tpebs_set_evsel(struct evsel *evsel, int cpu_map_idx, int thread) +{ + __u64 val; + bool found = false; + struct tpebs_retire_lat *t; + struct perf_counts_values *count; + + /* Non retire_latency evsels should never enter this function. */ + if (!evsel__is_retire_lat(evsel)) + return -1; + + /* + * Need to stop the forked 'perf record' to ensure the sampled data from + * the pipe is processed and a non-zero retire_lat value is returned for + * hybrid.
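+ * + * tpebs_stop() is a no-op when the forked 'perf record' has already been + * stopped by an earlier call, so calling it before every read is safe.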
+ */ + tpebs_stop(); + count = perf_counts(evsel->counts, cpu_map_idx, thread); + + list_for_each_entry(t, &tpebs_results, nd) { + if (t->tpebs_name == evsel->name || + (evsel->metric_id && !strcmp(t->tpebs_name, evsel->metric_id))) { + found = true; + break; + } + } + + /* Set ena and run to non-zero */ + count->ena = count->run = 1; + count->lost = 0; + + if (!found) { + /* + * Set the default value of 0 when the retire_latency for this + * event is not found in the sampling data (tpebs_recording not + * set, or no samples recorded). + */ + count->val = 0; + return 0; + } + + /* + * Only set the retire_latency value for the first CPU and thread. + */ + if (cpu_map_idx == 0 && thread == 0) + val = rint(t->val); + else + val = 0; + + count->val = val; + return 0; +} + +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r) +{ + zfree(&r->name); + free(r); +} + + +/* + * tpebs_delete - delete tpebs-related data and stop the created thread and + * process by calling tpebs_stop(). + * + * This function is called from evlist_delete() and also from builtin-stat + * stat_handle_error(). If tpebs_start() is called from places other than perf + * stat, we need to ensure tpebs_delete() is also called to safely free memory + * and shut down the data read thread and the forked perf record process. + * + * This function is also called in evsel__close() to be symmetric with + * tpebs_start() being called in evsel__open(). We will update this call site + * when we move tpebs_start() to the evlist level. + */ +void tpebs_delete(void) +{ + struct tpebs_retire_lat *r, *rtmp; + + if (tpebs_pid == -1) + return; + + tpebs_stop(); + + list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) { + list_del_init(&r->nd); + tpebs_retire_lat__delete(r); + } + + if (tpebs_cmd) { + free(tpebs_cmd); + tpebs_cmd = NULL; + } +} diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h new file mode 100644 index 000000000000..766b3fbd79f1 --- /dev/null +++ b/tools/perf/util/intel-tpebs.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * intel_tpebs.h: Intel TPEBS support + */ +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__ +#define INCLUDE__PERF_INTEL_TPEBS_H__ + +#include "stat.h" +#include "evsel.h" + +#ifdef HAVE_ARCH_X86_64_SUPPORT + +extern bool tpebs_recording; +int tpebs_start(struct evlist *evsel_list); +void tpebs_delete(void); +int tpebs_set_evsel(struct evsel *evsel, int cpu_map_idx, int thread); + +#else + +static inline int tpebs_start(struct evlist *evsel_list __maybe_unused) +{ + return 0; +} + +static inline void tpebs_delete(void) {} + +static inline int tpebs_set_evsel(struct evsel *evsel __maybe_unused, + int cpu_map_idx __maybe_unused, + int thread __maybe_unused) +{ + return 0; +} + +#endif +#endif -- cgit v1.2.3 From 0a7381601b8a55d44c97ea132e23876c0dd9f90e Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:20:58 -0400 Subject: perf vendor events intel: Add MTL metric JSON files Add MTL metric JSON files for TMA4.8. Some of the metrics' formulas use TPEBS retire_latency in MTL. This also includes the latest E-Core TMA3.6 changes.
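For reference, a metric formula consumes retire_latency as an event reference carrying the R modifier, in the same cpu_core@EVENT@<modifier> notation used by the other metric expressions in these files. An illustrative term (hypothetical, not quoted verbatim from the JSON below):

    min(cpu_core@MEM_INST_RETIRED.STLB_HIT_LOADS@R, 7) * cpu_core@MEM_INST_RETIRED.STLB_HIT_LOADS@ / tma_info_thread_clks

When TPEBS recording is enabled (tpebs_recording), perf stat fills such @R terms with the average retire latency sampled by the forked 'perf record' (see tools/perf/util/intel-tpebs.c above).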
Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-6-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/meteorlake/metricgroups.json | 142 ++ .../arch/x86/meteorlake/mtl-metrics.json | 2535 ++++++++++++++++++++ 2 files changed, 2677 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/meteorlake/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/meteorlake/metricgroups.json new file mode 100644 index 000000000000..b54a5fc0861f --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/meteorlake/metricgroups.json @@ -0,0 +1,142 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvBC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvBO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvCB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvFB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvIO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvMB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvML": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvMP": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvMS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvOB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvUW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "C0Wait": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from 
Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ifetch": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IntVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Load_Store_Miss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem_Exec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown 
at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "load_store_bound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_ifetch_bandwidth_group": "Metrics contributing to tma_ifetch_bandwidth category", + "tma_ifetch_latency_group": "Metrics contributing to tma_ifetch_latency category", + "tma_int_operations_group": "Metrics contributing to tma_int_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound 
category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_resource_bound_group": "Metrics contributing to tma_resource_bound category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json b/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json new file mode 100644 index 000000000000..1a7392f0da86 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json @@ -0,0 +1,2535 @@ +[ + { + "BriefDescription": "C10 residency percent per package", + "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C10_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C1 residency percent per core", + "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C1_Core_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C8 residency percent per package", + "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C8_Pkg_Residency", 
+ "ScaleUnit": "100%" + }, + { + "BriefDescription": "C9 residency percent per package", + "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C9_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", + "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", + "MetricGroup": "smi", + "MetricName": "smi_cycles", + "MetricThreshold": "smi_cycles > 0.1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Number of SMI interrupts.", + "MetricExpr": "msr@smi@", + "MetricGroup": "smi", + "MetricName": "smi_num", + "ScaleUnit": "1SMI#" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to certain allocation restrictions", + "MetricExpr": "tma_core_bound", + "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", + "MetricName": "tma_allocation_restriction", + "MetricThreshold": "tma_allocation_restriction > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.ALL_P@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "MetricThreshold": "tma_backend_bound > 0.1", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.ALL_P@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "MetricThreshold": "tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). 
Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.BRANCH_DETECT@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group", + "MetricName": "tma_branch_detect", + "MetricThreshold": "tma_branch_detect > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)", + "PublicDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to branch mispredicts", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.MISPREDICT@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "MetricThreshold": "tma_branch_mispredicts > 0.05 & tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.BRANCH_RESTEER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group", + "MetricName": "tma_branch_resteer", + "MetricThreshold": "tma_branch_resteer > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to the microcode sequencer (MS).", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.CISC@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group", + "MetricName": "tma_cisc", + "MetricThreshold": "tma_cisc > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles due to backend bound stalls that are bounded by core restrictions and not attributed to an outstanding load or stores, or resource limitation", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.1", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to decode stalls.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.DECODE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": 
"TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group", + "MetricName": "tma_decode", + "MetricThreshold": "tma_decode > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear that does not require the use of microcode, classified as a fast nuke, due to memory ordering, memory disambiguation and memory renaming", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.FASTNUKE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_fast_nuke", + "MetricThreshold": "tma_fast_nuke > 0.05 & (tma_machine_clears > 0.05 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ALL_P@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "MetricThreshold": "tma_frontend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL1", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to instruction cache misses.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ICACHE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group", + "MetricName": "tma_icache_misses", + "MetricThreshold": "tma_icache_misses > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_ifetch_bandwidth", + "MetricThreshold": "tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend latency restrictions due to icache misses, itlb misses, branch detection, and resteer limitations.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.FRONTEND_LATENCY@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_ifetch_latency", + "MetricThreshold": "tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Floating Point (FP) Operation", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@FP_FLOPS_RETIRED.ALL@", + "MetricGroup": "Flops", + "MetricName": "tma_info_arith_inst_mix_ipflop", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / (cpu_atom@FP_INST_RETIRED.128B_DP@ + cpu_atom@FP_INST_RETIRED.128B_SP@)", + "MetricGroup": "Flops", + "MetricName": "tma_info_arith_inst_mix_ipfparith_avx128", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per FP 
Arithmetic Scalar Double-Precision instruction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@FP_INST_RETIRED.64B_DP@", + "MetricGroup": "Flops", + "MetricName": "tma_info_arith_inst_mix_ipfparith_scalar_dp", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@FP_INST_RETIRED.32B_SP@", + "MetricGroup": "Flops", + "MetricName": "tma_info_arith_inst_mix_ipfparith_scalar_sp", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that retirement is stalled due to a first level data TLB miss", + "MetricExpr": "100 * (cpu_atom@LD_HEAD.DTLB_MISS_AT_RET@ + cpu_atom@LD_HEAD.PGWALK_AT_RET@) / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_bottleneck_%_dtlb_miss_bound_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that allocation and retirement is stalled by the Frontend Cluster due to an Ifetch Miss, either Icache or ITLB Miss", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_IFETCH.ALL@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Ifetch", + "MetricName": "tma_info_bottleneck_%_ifetch_miss_bound_cycles", + "PublicDescription": "Percentage of time that allocation and retirement is stalled by the Frontend Cluster due to an Ifetch Miss, either Icache or ITLB Miss. See Info.Ifetch_Bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that retirement is stalled due to an L1 miss", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Load_Store_Miss", + "MetricName": "tma_info_bottleneck_%_load_miss_bound_cycles", + "PublicDescription": "Percentage of time that retirement is stalled due to an L1 miss. See Info.Load_Miss_Bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that retirement is stalled by the Memory Cluster due to a pipeline stall", + "MetricExpr": "100 * cpu_atom@LD_HEAD.ANY_AT_RET@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Mem_Exec", + "MetricName": "tma_info_bottleneck_%_mem_exec_bound_cycles", + "PublicDescription": "Percentage of time that retirement is stalled by the Memory Cluster due to a pipeline stall. 
See Info.Mem_Exec_Bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricName": "tma_info_br_inst_mix_ipbranch", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.NEAR_CALL@", + "MetricName": "tma_info_br_inst_mix_ipcall", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Far Branch (Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.FAR_BRANCH@u", + "MetricName": "tma_info_br_inst_mix_ipfarbranch", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was not taken", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / (cpu_atom@BR_MISP_RETIRED.COND@ - cpu_atom@BR_MISP_RETIRED.COND_TAKEN@)", + "MetricName": "tma_info_br_inst_mix_ipmisp_cond_ntaken", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was taken", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.COND_TAKEN@", + "MetricName": "tma_info_br_inst_mix_ipmisp_cond_taken", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired indirect call or jump Branch Misprediction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.INDIRECT@", + "MetricName": "tma_info_br_inst_mix_ipmisp_indirect", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired return Branch Misprediction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.RETURN@", + "MetricName": "tma_info_br_inst_mix_ipmisp_ret", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired Branch Misprediction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@", + "MetricName": "tma_info_br_inst_mix_ipmispredict", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Ratio of all branches which mispredict", + "MetricExpr": "cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@ / cpu_atom@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricName": "tma_info_br_mispredict_bound_branch_mispredict_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Ratio between Mispredicted branches and unknown branches", + "MetricExpr": "cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@ / cpu_atom@BACLEARS.ANY@", + "MetricName": "tma_info_br_mispredict_bound_branch_mispredict_to_unknown_branch_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that allocation is stalled due to load buffer full", + "MetricExpr": "100 * cpu_atom@MEM_SCHEDULER_BLOCK.LD_BUF@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_buffer_stalls_%_load_buffer_stall_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that allocation is stalled due to memory reservation stations full", + "MetricExpr": "100 * cpu_atom@MEM_SCHEDULER_BLOCK.RSV@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_buffer_stalls_%_mem_rsv_stall_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that allocation is stalled due to store buffer full", + "MetricExpr": "100 *
cpu_atom@MEM_SCHEDULER_BLOCK.ST_BUF@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_buffer_stalls_%_store_buffer_stall_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Cycles Per Instruction", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_core_cpi", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "cpu_atom@FP_FLOPS_RETIRED.ALL@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Flops", + "MetricName": "tma_info_core_flopc", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions Per Cycle", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_core_ipc", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "cpu_atom@TOPDOWN_RETIRING.ALL_P@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_core_upi", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss hits in the L2", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_IFETCH.L2_HIT@ / cpu_atom@MEM_BOUND_STALLS_IFETCH.ALL@", + "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l2hit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss hits in the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_IFETCH.LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS_IFETCH.ALL@", + "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l3hit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss subsequently misses in the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_IFETCH.LLC_MISS@ / cpu_atom@MEM_BOUND_STALLS_IFETCH.ALL@", + "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l3miss", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that hit the L2", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_LOAD.L2_HIT@ / cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l2hit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that hit the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_LOAD.LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l3hit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that subsequently misses the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_LOAD.LLC_MISS@ / cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l3miss", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a pipeline block", + "MetricExpr": "100 * cpu_atom@LD_HEAD.L1_BOUND_AT_RET@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_store_bound_l1_bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement", + "MetricExpr": "100 * 
(cpu_atom@LD_HEAD.L1_BOUND_AT_RET@ + cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@) / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_store_bound_load_bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full", + "MetricExpr": "100 * (cpu_atom@MEM_SCHEDULER_BLOCK.ST_BUF@ / cpu_atom@MEM_SCHEDULER_BLOCK.ALL@) * tma_mem_scheduler", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_store_bound_store_bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to floating point assists", + "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.FP_ASSIST@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_machine_clear_bound_machine_clears_fp_assist_pki", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to page faults", + "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.PAGE_FAULT@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_machine_clear_bound_machine_clears_page_fault_pki", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to self-modifying code", + "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.SMC@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_machine_clear_bound_machine_clears_smc_pki", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads with an address aliasing block", + "MetricExpr": "100 * cpu_atom@LD_BLOCKS.ADDRESS_ALIAS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_exec_blocks_%_loads_with_adressaliasing", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block", + "MetricExpr": "100 * cpu_atom@LD_BLOCKS.DATA_UNKNOWN@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_exec_blocks_%_loads_with_storefwdblk", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to a first level data cache miss", + "MetricExpr": "100 * cpu_atom@LD_HEAD.L1_MISS_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_l1miss", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to other block cases, such as pipeline conflicts, fences, etc", + "MetricExpr": "100 * cpu_atom@LD_HEAD.OTHER_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_otherpipelineblks", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to a pagewalk", + "MetricExpr": "100 * cpu_atom@LD_HEAD.PGWALK_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_pagewalk", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to a second level TLB miss", + "MetricExpr": "100 * cpu_atom@LD_HEAD.DTLB_MISS_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_stlbhit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to a store forward address match", + "MetricExpr": "100 * cpu_atom@LD_HEAD.ST_ADDR_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": 
"tma_info_mem_exec_bound_%_loadhead_with_storefwding", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Load", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_mix_ipload", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Store", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@MEM_UOPS_RETIRED.ALL_STORES@", + "MetricName": "tma_info_mem_mix_ipstore", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads that perform one or more locks", + "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.LOCK_LOADS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_mix_load_locks_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads that are splits", + "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.SPLIT_LOADS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_mix_load_splits_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Ratio of mem load uops to all uops", + "MetricExpr": "1e3 * cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_mem_mix_memload_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that the core is stalled due to a TPAUSE or UMWAIT instruction", + "MetricExpr": "100 * cpu_atom@SERIALIZATION.C01_MS_SCB@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricName": "tma_info_serialization _%_tpause_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricName": "tma_info_system_cpu_utilization", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "cpu_atom@FP_FLOPS_RETIRED.ALL@ / (duration_time * 1e9)", + "MetricGroup": "Flops", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE_P@k / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_kernel_utilization", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Average Frequency Utilization relative to nominal frequency", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@ / cpu_atom@CPU_CLK_UNHALTED.REF_TSC@", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are FPDiv uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.FPDIV@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_uop_mix_fpdiv_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are IDiv uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.IDIV@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_uop_mix_idiv_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are microcode ops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.MS@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_uop_mix_microcode_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are x87 uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.X87@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_uop_mix_x87_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ITLB_MISS@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group", + "MetricName": "tma_itlb_misses", + "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "MetricThreshold": "tma_machine_clears > 0.05 & tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.MEM_SCHEDULER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_mem_scheduler", + "MetricThreshold": "tma_mem_scheduler > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops", +
"MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_non_mem_scheduler", + "MetricThreshold": "tma_non_mem_scheduler > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear that requires the use of microcode (slow nuke)", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.NUKE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_nuke", + "MetricThreshold": "tma_nuke > 0.05 & (tma_machine_clears > 0.05 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to other common frontend stalls not categorized.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.OTHER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group", + "MetricName": "tma_other_fb", + "MetricThreshold": "tma_other_fb > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to wrong predecodes.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.PREDECODE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group", + "MetricName": "tma_predecode", + "MetricThreshold": "tma_predecode > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls)", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.REGISTER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_register", + "MetricThreshold": "tma_register > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the reorder buffer being full (ROB stalls)", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.REORDER_BUFFER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_reorder_buffer", + "MetricThreshold": "tma_reorder_buffer > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to a resource limitation", + "MetricExpr": "tma_backend_bound - tma_core_bound", + "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_resource_bound", + "MetricThreshold": "tma_resource_bound > 0.2 & tma_backend_bound > 0.1", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that result in retirement slots", + "MetricExpr": "cpu_atom@TOPDOWN_RETIRING.ALL_P@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL1;tma_L1_group", + 
"MetricName": "tma_retiring", + "MetricThreshold": "tma_retiring > 0.75", + "MetricgroupNoGroup": "TopdownL1", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS)", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.SERIALIZATION@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_serialization", + "MetricThreshold": "tma_serialization > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_0@ + cpu_core@UOPS_DISPATCHED.PORT_1@ + cpu_core@UOPS_DISPATCHED.PORT_5_11@ + cpu_core@UOPS_DISPATCHED.PORT_6@) / (5 * tma_info_core_core_clks)", + "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "MetricThreshold": "tma_alu_op_utilization > 0.4", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "78 * cpu_core@ASSISTS.ANY@ / tma_info_thread_slots", + "MetricGroup": "BvIO;TopdownL4;tma_L4_group;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: ASSISTS.ANY", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handling SSE to AVX* or AVX* to SSE transition Assists.", + "MetricExpr": "63 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_avx_assists", + "MetricThreshold": "tma_avx_assists > 0.1", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "MetricThreshold": "tma_backend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", + "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "MetricThreshold": "tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches is categorized under the Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", + "MetricName": "tma_branch_mispredicts", + "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. 
These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from the corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to staying in C0.1 power-performance optimized state (Faster wakeup time; Smaller power savings).", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C01@ / tma_info_thread_clks", + "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_c01_wait", + "MetricThreshold": "tma_c01_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to staying in C0.2 power-performance optimized state (Slower wakeup time; Larger power savings).", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C02@ / tma_info_thread_clks", + "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_c02_wait", + "MetricThreshold": "tma_c02_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "MetricThreshold": "tma_cisc > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources. 
Sample with: FRONTEND_RETIRED.MS_FLOWS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", + "MetricName": "tma_clears_resteers", + "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@R, 24 * tma_info_system_core_frequency) + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@R, 25 * tma_info_system_core_frequency) * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", + "MetricGroup": "BvMS;DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS. Related metrics: tma_data_sharing, tma_false_sharing, tma_machine_clears, tma_remote_cache", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were the bottleneck", + "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)", + "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were the bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@R, 24 * tma_info_system_core_frequency) + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@R, 24 * tma_info_system_core_frequency) * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", + "MetricGroup": "BvMS;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD. Related metrics: tma_contested_accesses, tma_false_sharing, tma_machine_clears, tma_remote_cache", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu_core@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu_core@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", + "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "cpu_core@ARITH.DIV_ACTIVE@ / tma_info_thread_clks", + "MetricGroup": "BvCB;TopdownL3;tma_L3_group;tma_core_bound_group", + "MetricName": "tma_divider", + "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer than integer or Floating Point addition; subtraction; or multiplication. 
Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@ / tma_info_thread_clks", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(cpu_core@IDQ.DSB_CYCLES_ANY@ - cpu_core@IDQ.DSB_CYCLES_OK@) / tma_info_core_core_clks / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "cpu_core@DSB2MITE_SWITCHES.PENALTY_CYCLES@ / tma_info_thread_clks", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", + "MetricName": "tma_dsb_switches", + "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties; hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.STLB_HIT_LOADS@ * min(cpu_core@MEM_INST_RETIRED.STLB_HIT_LOADS@R, 7) / tma_info_thread_clks + tma_load_stlb_miss", + "MetricGroup": "BvMT;MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.STLB_HIT_STORES@ * min(cpu_core@MEM_INST_RETIRED.STLB_HIT_STORES@R, 7) / tma_info_thread_clks + tma_store_stlb_miss", + "MetricGroup": "BvMT;MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "28 * tma_info_system_core_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_thread_clks", + "MetricGroup": "BvMS;DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM. Related metrics: tma_contested_accesses, tma_data_sharing, tma_machine_clears, tma_remote_cache", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "cpu_core@L1D_PEND_MISS.FB_FULL@ / tma_info_thread_clks", + "MetricGroup": "BvMS;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "MetricThreshold": "tma_fb_full > 0.3", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. 
The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints at approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", + "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", + "MetricName": "tma_fetch_bandwidth", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers a suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_thread_slots", + "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "max(0, tma_heavy_operations - tma_microcode_sequencer)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueD0", + "MetricName": "tma_few_uops_instructions", + "MetricThreshold": "tma_few_uops_instructions > 0.05 & tma_heavy_operations > 0.1", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions. 
Related metrics: tma_decoder0_alone", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "MetricThreshold": "tma_fp_arith > 0.2 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists", + "MetricExpr": "30 * cpu_core@ASSISTS.FP@ / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_fp_assists", + "MetricThreshold": "tma_fp_assists > 0.1", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", + "MetricName": "tma_fp_scalar", + "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", + "MetricName": "tma_fp_vector", + "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. 
Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_128b", + "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_256b", + "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_thread_slots", + "MetricGroup": "BvFB;BvIO;PGO;TmaL1;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "MetricThreshold": "tma_frontend_bound > 0.15", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible for fetching operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. 
For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", + "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.MACRO_FUSED@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Branches;BvBO;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_fused_instructions", + "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", + "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "MetricThreshold": "tma_heavy_operations > 0.1", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .). Sample with: UOPS_RETIRED.HEAVY", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "cpu_core@ICACHE_DATA.STALLS@ / tma_info_thread_clks", + "MetricGroup": "BigFootprint;BvBC;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / 100", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). 
Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.COND_NTAKEN@", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.COND_TAKEN@", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.INDIRECT@", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.RET@", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_ret", + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "cpu_core@INT_MISC.CLEARS_COUNT@ / (cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ + cpu_core@MACHINE_CLEARS.COUNT@)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_lsd + tma_mite)))", + "MetricGroup": "DSB;FetchBW;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", + "MetricThreshold": "tma_info_botlnk_l2_dsb_bandwidth > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: ", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", + "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA", + "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@INST_RETIRED.NOP@) / tma_info_thread_slots)", + "MetricGroup": "BvBO;Ret", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5", + "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. 
(A lower bound)", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_hit_latency / (tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. 
Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))) - tma_info_bottleneck_big_code", + "MetricGroup": "BvFB;Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY\\,umask\\=1@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). 
Related metrics: tma_microcode_sequencer, tma_ms_switches", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "BvMS;Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. 
Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_useful_work)", + "MetricGroup": "BvOB;Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@INST_RETIRED.NOP@) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "BvUW;Ret", + "MetricName": "tma_info_bottleneck_useful_work", + "MetricThreshold": "tma_info_bottleneck_useful_work > 20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches that are non-taken conditionals", + "MetricExpr": "cpu_core@BR_INST_RETIRED.COND_NTAKEN@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;Branches;CodeGen;PGO", + "MetricName": "tma_info_branches_cond_nt", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches that are taken conditionals", + "MetricExpr": "cpu_core@BR_INST_RETIRED.COND_TAKEN@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;Branches;CodeGen;PGO", + "MetricName": "tma_info_branches_cond_tk", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", + "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@) / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_other_branches", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", + "MetricExpr": "(cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@ if 
#SMT_on else tma_info_thread_clks)", + "MetricGroup": "SMT", + "MetricName": "tma_info_core_core_clks", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / tma_info_core_core_clks", + "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_core_coreipc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricExpr": "(cpu_core@FP_ARITH_DISPATCHED.PORT_0@ + cpu_core@FP_ARITH_DISPATCHED.PORT_1@ + cpu_core@FP_ARITH_DISPATCHED.PORT_5@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common).", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", + "MetricExpr": "cpu_core@IDQ.DSB_UOPS@ / cpu_core@UOPS_ISSUED.ANY@", + "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 6 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", + "MetricExpr": "cpu_core@DSB2MITE_SWITCHES.PENALTY_CYCLES@ / cpu_core@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", + "MetricGroup": "DSBmiss", + "MetricName": "tma_info_frontend_dsb_switch_cost", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of Uops issued by front-end when it issued something", + "MetricExpr": "cpu_core@UOPS_ISSUED.ANY@ / cpu_core@UOPS_ISSUED.ANY\\,cmask\\=1@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_frontend_fetch_upc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "cpu_core@ICACHE_DATA.STALLS@ / cpu_core@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FRONTEND_RETIRED.ANY_DSB_MISS@", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / cpu_core@BACLEARS.ANY@", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * cpu_core@FRONTEND_RETIRED.L2_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.CODE_RD_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", + "MetricExpr": "cpu_core@LSD.UOPS@ / cpu_core@UOPS_ISSUED.ANY@", + "MetricGroup": "Fed;LSD", + "MetricName": "tma_info_frontend_lsd_coverage", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection", + "MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_unknown_branch_cost", + "PublicDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection. 
See Unknown_Branches node.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Summary;TmaL1;tma_L1_group", + "MetricName": "tma_info_inst_mix_instructions", + "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + cpu_core@FP_ARITH_INST_RETIRED.VECTOR@)", + "MetricGroup": "Flops;InsType", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@)", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@", + "MetricGroup": "Flops;FpScalar;InsType", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). 
Values < 1 are possible due to intentional FMA double counting.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@", + "MetricGroup": "Flops;FpScalar;InsType", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Branches;Fed;InsType", + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.NEAR_CALL@", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricGroup": "Flops;InsType", + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@MEM_INST_RETIRED.ALL_LOADS@", + "MetricGroup": "InsType", + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / cpu_core@CPU_CLK_UNHALTED.PAUSE_INST@", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@", + "MetricGroup": "InsType", + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", + "MetricGroup": "Prefetches", + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per taken branch", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@", + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", + "MetricName": 
"tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 13", + "PublicDescription": "Instructions per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_fb_hpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l1mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.ALL_DEMAND_DATA_RD@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l1mpki_load", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", + "MetricExpr": "1e3 * (cpu_core@L2_RQSTS.REFERENCES@ - cpu_core@L2_RQSTS.MISS@) / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l2hpki_all", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_HIT@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l2hpki_load", + "Unit": "cpu_core" + }, + { + 
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L2_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Backend;CacheHits;Mem", + "MetricName": "tma_info_memory_l2mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem;Offcore", + "MetricName": "tma_info_memory_l2mpki_all", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l2mpki_load", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Offcore requests (L2 cache miss) per kilo instruction for demand RFOs", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.RFO_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheMisses;Offcore", + "MetricName": "tma_info_memory_l2mpki_rfo", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DATA_RD@ / cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_latency_data_l2_mlp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS.DEMAND_DATA_RD@", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_latency_load_l2_miss_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_latency_load_l2_mlp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Latency for L3 cache miss demand Loads", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD@", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_latency_load_l3_miss_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": 
"cpu_core@L1D_PEND_MISS.PENDING@ / cpu_core@MEM_LOAD_COMPLETED.L1_MISS_ANY@", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "1e3 * cpu_core@SQ_MISC.BUS_LOCK@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_bus_lock_pki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_MISC_RETIRED.UC@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / cpu_core@L1D_PEND_MISS.PENDING_CYCLES@", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", + "Unit": "cpu_core" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * cpu_core@ITLB_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * cpu_core@DTLB_LOAD_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_load_stlb_mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(cpu_core@ITLB_MISSES.WALK_PENDING@ + cpu_core@DTLB_LOAD_MISSES.WALK_PENDING@ + cpu_core@DTLB_STORE_MISSES.WALK_PENDING@) / (4 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5", + "Unit": "cpu_core" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * cpu_core@DTLB_STORE_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per core", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of uops fetched from DSB per cycle", + "MetricExpr": "cpu_core@IDQ.DSB_UOPS@ / cpu_core@IDQ.DSB_CYCLES_ANY@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_pipeline_fetch_dsb", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of uops fetched from LSD per cycle", + "MetricExpr": 
"cpu_core@LSD.UOPS@ / cpu_core@LSD.CYCLES_ACTIVE@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_pipeline_fetch_lsd", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of uops fetched from MITE per cycle", + "MetricExpr": "cpu_core@IDQ.MITE_UOPS@ / cpu_core@IDQ.MITE_CYCLES_ANY@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_pipeline_fetch_mite", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@ASSISTS.ANY@", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", + "MetricExpr": "cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "MicroSeq;Pipeline;Ret", + "MetricName": "tma_info_pipeline_strings_cycles", + "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of cycles the processor is waiting yet unhalted; covering legacy PAUSE instruction, as well as C0.1 / C0.2 power-performance optimized states", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C0_WAIT@ / tma_info_thread_clks", + "MetricGroup": "C0Wait", + "MetricName": "tma_info_system_c0_wait", + "MetricThreshold": "tma_info_system_c0_wait > 0.05", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_core_frequency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average CPU Utilization (percentage)", + "MetricExpr": "tma_info_system_cpus_utilized / #num_cpus_online", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_HAC_ARB_TRK_REQUESTS.ALL + UNC_HAC_ARB_COH_TRK_REQUESTS.ALL) / 1e9 / duration_time", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. 
Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Far Branch (Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.FAR_BRANCH@u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD_P@k / cpu_core@INST_RETIRED.ANY_P@k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD_P@k / cpu_core@CPU_CLK_UNHALTED.THREAD@", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory.
Accounts for demand loads and L1/L2 prefetches", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", + "MetricExpr": "(1 - cpu_core@CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE@ / cpu_core@CPU_CLK_UNHALTED.REF_DISTRIBUTED@ if #SMT_on else 0)", + "MetricGroup": "SMT", + "MetricName": "tma_info_system_smt_2t_utilization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Socket actual clocks when any core is active on that socket", + "MetricExpr": "UNC_CLOCK.SOCKET", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_socket_clks", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "tma_info_thread_clks / cpu_core@CPU_CLK_UNHALTED.REF_TSC@", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi", + "Unit": "cpu_core" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_ISSUED.ANY@", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "cpu_core@TOPDOWN.SLOTS@", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", + "MetricExpr": "(tma_info_thread_slots / (cpu_core@TOPDOWN.SLOTS@ / 2) if #SMT_on else 1)", + "MetricGroup": "SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots_utilization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops per taken branch", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@", + "MetricGroup": "Branches;Fed;FetchBW", + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 9", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b", + "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_int_operations", + 
"MetricThreshold": "tma_int_operations > 0.1 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired). Vector/Matrix Int operations and shuffles are counted. Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", + "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", + "MetricName": "tma_int_vector_128b", + "MetricThreshold": "tma_int_vector_128b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", + "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", + "MetricName": "tma_int_vector_256b", + "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "cpu_core@ICACHE_TAG.STALLS@ / tma_info_thread_clks", + "MetricGroup": "BigFootprint;BvBC;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@) / tma_info_thread_clks, 0)", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads that miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles with demand load accesses that hit the L1 cache", + "MetricExpr": "min(2 * (cpu_core@MEM_INST_RETIRED.ALL_LOADS@ - cpu_core@MEM_LOAD_RETIRED.FB_HIT@ - cpu_core@MEM_LOAD_RETIRED.L1_MISS@) * 20 / 100, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_thread_clks", + "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", + "MetricName": "tma_l1_hit_latency", + "MetricThreshold": "tma_l1_hit_latency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates fraction of cycles with demand load accesses that hit the L1 cache. The short latency of the L1 data cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@) / tma_info_thread_clks", + "MetricGroup": "BvML;CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.
Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@) / tma_info_thread_clks", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * min(cpu_core@MEM_LOAD_RETIRED.L3_HIT@R, 9 * tma_info_system_core_frequency) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", + "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "cpu_core@DECODE.LCP@ / tma_info_thread_clks", + "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", + "MetricName": "tma_lcp", + "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)", + "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "MetricThreshold": "tma_light_operations > 0.6", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", + "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_2_3_10@ / (3 * tma_info_core_core_clks)", + "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "MetricThreshold": "tma_load_op_utilization > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations. 
Sample with: UOPS_DISPATCHED.PORT_2_3_10", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "max(0, tma_dtlb_load - tma_load_stlb_miss)", + "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "MetricThreshold": "tma_load_stlb_hit > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "cpu_core@DTLB_LOAD_MISSES.WALK_ACTIVE@ / tma_info_thread_clks", + "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ * cpu_core@MEM_INST_RETIRED.LOCK_LOADS@R / tma_info_thread_clks", + "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS. Related metrics: tma_store_latency", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", + "MetricExpr": "(cpu_core@LSD.CYCLES_ACTIVE@ - cpu_core@LSD.CYCLES_OK@) / tma_info_core_core_clks / 2", + "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", + "MetricName": "tma_lsd", + "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)", + "MetricGroup": "BadSpec;BvMS;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", + "MetricName": "tma_machine_clears", + "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. 
These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", + "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", + "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", + "MetricName": "tma_mem_bandwidth", + "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", + "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@) / tma_info_thread_clks - tma_mem_bandwidth", + "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", + "MetricName": "tma_mem_latency", + "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). 
Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_memory_fence", + "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * cpu_core@MEM_UOP_RETIRED.ANY@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "cpu_core@UOPS_RETIRED.MS@ / tma_info_thread_slots", + "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", + "MetricName": "tma_microcode_sequencer", + "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. 
Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks", + "MetricGroup": "BadSpec;BrMispredicts;BvMP;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", + "MetricName": "tma_mispredicts_resteers", + "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(cpu_core@IDQ.MITE_CYCLES_ANY@ - cpu_core@IDQ.MITE_CYCLES_OK@) / tma_info_core_core_clks / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", + "MetricExpr": "160 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_thread_clks", + "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "MetricThreshold": "tma_mixing_vectors > 0.05", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. 
Related metrics: tma_ms_switches", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (cpu_core@UOPS_RETIRED.SLOTS@ / cpu_core@UOPS_ISSUED.ANY@) / tma_info_thread_clks", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", + "MetricName": "tma_ms_switches", + "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", + "MetricExpr": "tma_light_operations * (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ - cpu_core@INST_RETIRED.MACRO_FUSED@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Branches;BvBO;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_non_fused_branches", + "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.NOP@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "BvBO;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", + "MetricName": "tma_nop_instructions", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. 
Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", + "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / (cpu_core@INT_MISC.CLEARS_COUNT@ - cpu_core@MACHINE_CLEARS.COUNT@)), 0.0001)", + "MetricGroup": "BrMispredicts;BvIO;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - cpu_core@MACHINE_CLEARS.MEMORY_ORDERING@ / cpu_core@MACHINE_CLEARS.COUNT@), 0.0001)", + "MetricGroup": "BvIO;Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults", + "MetricExpr": "99 * cpu_core@ASSISTS.PAGE_FAULT@ / tma_info_thread_slots", + "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_page_faults", + "MetricThreshold": "tma_page_faults > 0.05", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults. A Page Fault may apply on first application access to a memory page. Note operating system handling of page faults accounts for the majority of its cost.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", + "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_0@ / tma_info_core_core_clks", + "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", + "MetricName": "tma_port_0", + "MetricThreshold": "tma_port_0 > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED.PORT_0. 
Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", + "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_1@ / tma_info_core_core_clks", + "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", + "MetricName": "tma_port_1", + "MetricThreshold": "tma_port_1 > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", + "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_6@ / tma_info_core_core_clks", + "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", + "MetricName": "tma_port_6", + "MetricThreshold": "tma_port_6 > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "max((cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ + max(cpu_core@RS.EMPTY_RESOURCE@ - cpu_core@RESOURCE_STALLS.SCOREBOARD@, 0)) / tma_info_thread_clks, 1) * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks", + "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ / tma_info_thread_clks", + "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Sample with: EXE_ACTIVITY.1_PORTS_UTIL. Related metrics: tma_l1_bound", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "cpu_core@EXE_ACTIVITY.2_PORTS_UTIL@ / tma_info_thread_clks", + "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL. 
Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks", + "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricGroup": "BvUW;TmaL1;TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "cpu_core@RESOURCE_STALLS.SCOREBOARD@ / tma_info_thread_clks + tma_c02_wait", + "MetricGroup": "BvIO;PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", + "MetricName": "tma_serializing_operation", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. 
Related metrics: tma_ms_switches", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer)", + "MetricExpr": "tma_light_operations * cpu_core@INT_VEC_RETIRED.SHUFFLES@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", + "MetricName": "tma_shuffles_256b", + "MetricThreshold": "tma_shuffles_256b > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer). Shuffles may incur slow cross \"vector lane\" data transfers.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.PAUSE@ / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_slow_pause", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: CPU_CLK_UNHALTED.PAUSE_INST", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.SPLIT_LOADS@ * min(cpu_core@MEM_INST_RETIRED.SPLIT_LOADS@R, tma_info_memory_load_miss_real_latency) / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.SPLIT_STORES@ * min(cpu_core@MEM_INST_RETIRED.SPLIT_STORES@R, 1) / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", + "MetricName": "tma_split_stores", + "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS. 
Related metrics: tma_port_4", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "(cpu_core@XQ.FULL_CYCLES@ + cpu_core@L1D_PEND_MISS.L2_STALLS@) / tma_info_thread_clks", + "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "cpu_core@EXE_ACTIVITY.BOUND_ON_STORES@ / tma_info_thread_clks", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * cpu_core@LD_BLOCKS.STORE_FORWARD@ / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. 
For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "(cpu_core@MEM_STORE_RETIRED.L2_HIT@ * 10 * (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) + (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) * min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@)) / tma_info_thread_clks", + "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", + "MetricName": "tma_store_latency", + "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full). Related metrics: tma_fb_full, tma_lock_latency", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_4_9@ + cpu_core@UOPS_DISPATCHED.PORT_7_8@) / (4 * tma_info_core_core_clks)", + "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "MetricThreshold": "tma_store_op_utilization > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations. 
Sample with: UOPS_DISPATCHED.PORT_7_8", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "max(0, tma_dtlb_store - tma_store_stlb_miss)", + "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "MetricThreshold": "tma_store_stlb_hit > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "cpu_core@DTLB_STORE_MISSES.WALK_ACTIVE@ / tma_info_core_core_clks", + "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", + "MetricExpr": "9 * cpu_core@OCR.STREAMING_WR.ANY_RESPONSE@ / tma_info_thread_clks", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", + "MetricName": "tma_streaming_stores", + "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. Sample with: OCR.STREAMING_WR.ANY_RESPONSE. Related metrics: tma_fb_full", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / tma_info_thread_clks", + "MetricGroup": "BigFootprint;BvBC;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. 
Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * cpu_core@UOPS_EXECUTED.X87@ / cpu_core@UOPS_EXECUTED.THREAD@", + "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + } +] -- cgit v1.2.3 From d546e3acf3526eaf1d74902236c7219a424ac2e9 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:20:59 -0400 Subject: perf stat: Add command line option for enabling TPEBS recording With this command line option, TPEBS recording is turned off in 'perf stat' on default. It will only be turned on when this option is given in 'perf stat' command. Example with --record-tpebs: perf stat -M tma_split_loads -C1-4 --record-tpebs sleep 1 [ perf record: Woken up 2 times to write data ] [ perf record: Captured and wrote 0.044 MB - ] Performance counter stats for 'CPU(s) 1-4': 53,259,156,071 cpu_core/TOPDOWN.SLOTS/ # 1.6 % tma_split_loads (50.00%) 15,867,565,250 cpu_core/topdown-retiring/ (50.00%) 15,655,580,731 cpu_core/topdown-mem-bound/ (50.00%) 11,738,022,218 cpu_core/topdown-bad-spec/ (50.00%) 6,151,265,424 cpu_core/topdown-fe-bound/ (50.00%) 20,445,917,581 cpu_core/topdown-be-bound/ (50.00%) 6,925,098,013 cpu_core/L1D_PEND_MISS.PENDING/ (50.00%) 3,838,653,421 cpu_core/MEMORY_ACTIVITY.STALLS_L1D_MISS/ (50.00%) 4,797,059,783 cpu_core/EXE_ACTIVITY.BOUND_ON_LOADS/ (50.00%) 11,931,916,714 cpu_core/CPU_CLK_UNHALTED.THREAD/ (50.00%) 102,576,164 cpu_core/MEM_LOAD_COMPLETED.L1_MISS_ANY/ (50.00%) 64,071,854 cpu_core/MEM_INST_RETIRED.SPLIT_LOADS/ (50.00%) 3 cpu_core/MEM_INST_RETIRED.SPLIT_LOADS/R 1.003049679 seconds time elapsed Example without --record-tpebs: perf stat -M tma_contested_accesses -C1 sleep 1 Performance counter stats for 'CPU(s) 1': 50,203,891 cpu_core/TOPDOWN.SLOTS/ # 0.0 % tma_contested_accesses (63.60%) 10,040,777 cpu_core/topdown-retiring/ (63.60%) 6,890,729 cpu_core/topdown-mem-bound/ (63.60%) 2,756,463 cpu_core/topdown-bad-spec/ (63.60%) 10,828,288 cpu_core/topdown-fe-bound/ (63.60%) 28,350,432 cpu_core/topdown-be-bound/ (63.60%) 98 cpu_core/OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM/ (63.70%) 577,520 cpu_core/MEMORY_ACTIVITY.STALLS_L2_MISS/ (54.62%) 313,339 cpu_core/MEMORY_ACTIVITY.STALLS_L3_MISS/ (54.62%) 14,155 cpu_core/MEM_LOAD_RETIRED.L1_MISS/ (45.54%) 0 cpu_core/OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD/ (36.30%) 8,468,077 cpu_core/CPU_CLK_UNHALTED.THREAD/ (45.38%) 198 cpu_core/MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS/ (45.38%) 8,324 cpu_core/MEM_LOAD_RETIRED.FB_HIT/ (45.38%) 3,388,031,520 TSC 23,226,785 cpu_core/CPU_CLK_UNHALTED.REF_TSC/ (54.46%) 80 cpu_core/MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD/ (54.46%) 0 cpu_core/MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD/R 0 cpu_core/MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS/R 1,006,816,667 ns duration_time 1.002537737 seconds time elapsed Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: 
Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-7-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 8 ++++++++ tools/perf/builtin-stat.c | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 29756a87ab6f..2bc063672486 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -498,6 +498,14 @@ To interpret the results it is usually needed to know on which CPUs the workload runs on. If needed the CPUs can be forced using taskset. +--record-tpebs:: +Enable automatic sampling on Intel TPEBS retire_latency events (event with :R +modifier). Without this option, perf would not capture dynamic retire_latency +at runtime. Currently, a zero value is assigned to the retire_latency event when +this option is not set. The TPEBS hardware feature starts from Intel Granite +Rapids microarchitecture. This option only exists in X86_64 and is meaningful on +Intel platforms with TPEBS feature. + --td-level:: Print the top-down statistics that equal the input level. It allows users to print the interested top-down metrics level instead of the diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6162b98fc941..cf985cdb9a6e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2475,6 +2475,10 @@ int cmd_stat(int argc, const char **argv) "disable adding events for the metric threshold calculation"), OPT_BOOLEAN(0, "topdown", &topdown_run, "measure top-down statistics"), +#ifdef HAVE_ARCH_X86_64_SUPPORT + OPT_BOOLEAN(0, "record-tpebs", &tpebs_recording, + "enable recording for tpebs when retire_latency required"), +#endif OPT_UINTEGER(0, "td-level", &stat_config.topdown_level, "Set the metrics level for the top-down statistics (0: max level)"), OPT_BOOLEAN(0, "smi-cost", &smi_cost, -- cgit v1.2.3 From 169f18fd980cea90804bd8bc5a5614335fa0c8ae Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:21:00 -0400 Subject: perf Document: Add TPEBS (Timed PEBS(Precise Event-Based Sampling)) to Documents TPEBS (Timed PEBS(Precise Event-Based Sampling)) is a new feature Intel PMU from Granite Rapids microarchitecture. It will be used in new TMA (Top-Down Microarchitecture Analysis) releases. Add related introduction to documents while adding new code to support it in 'perf stat'. Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-8-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 1 + tools/perf/Documentation/topdown.txt | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 6bf2468f59d3..dea005410ec0 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -72,6 +72,7 @@ counted. 
The following modifiers exist: W - group is weak and will fallback to non-group if not schedulable, e - group or event are exclusive and do not share the PMU b - use BPF aggregration (see perf stat --bpf-counters) + R - retire latency value of the event The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier can be specified multiple times: diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt index ae0aee86844f..5c17fff694ee 100644 --- a/tools/perf/Documentation/topdown.txt +++ b/tools/perf/Documentation/topdown.txt @@ -325,6 +325,36 @@ other four level 2 metrics by subtracting corresponding metrics as below. Fetch_Bandwidth = Frontend_Bound - Fetch_Latency Core_Bound = Backend_Bound - Memory_Bound +TPEBS in TopDown +================ + +TPEBS (Timed PEBS) is one of the new Intel PMU features provided since Granite +Rapids microarchitecture. The TPEBS feature adds a 16 bit retire_latency field +in the Basic Info group of the PEBS record. It records the Core cycles since the +retirement of the previous instruction to the retirement of current instruction. +Please refer to Section 8.4.1 of "Intel® Architecture Instruction Set Extensions +Programming Reference" for more details about this feature. Because this feature +extends PEBS record, sampling with weight option is required to get the +retire_latency value. + + perf record -e event_name -W ... + +In the most recent release of TMA, the metrics begin to use event retire_latency +values in some of the metrics’ formulas on processors that support TPEBS feature. +For previous generations that do not support TPEBS, the values are static and +predefined per processor family by the hardware architects. Due to the diversity +of workloads in execution environments, retire_latency values measured at real +time are more accurate. Therefore, new TMA metrics that use TPEBS will provide +more accurate performance analysis results. + +To support TPEBS in TMA metrics, a new modifier :R on event is added. Perf would +capture retire_latency value of required events(event with :R in metric formula) +with perf record. The retire_latency value would be used in metric calculation. +Currently, this feature is supported through perf stat + + perf stat -M metric_name --record-tpebs ... + + [1] https://software.intel.com/en-us/top-down-microarchitecture-analysis-method-win [2] https://sites.google.com/site/analysismethods/yasin-pubs -- cgit v1.2.3 From b2738fda24543777a623a7d1cc2a9985ab81b448 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:21:01 -0400 Subject: perf test: Add test for Intel TPEBS counting mode Intel TPEBS sampling mode is supported through perf record. The counting mode code uses perf record to capture retire_latency value and use it in metric calculation. This test checks the counting mode code on Intel platforms. 
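In essence, the test drives the same flow a user would type by hand; a minimal sketch of that flow (cache-misses:R is the event the test itself picks, because it should exist on all platforms):

  # Without --record-tpebs, the retire_latency value stays zero:
  perf stat -e cache-misses:R true

  # With --record-tpebs, 'perf stat' runs a 'perf record' session
  # underneath to capture the retire_latency value at runtime:
  perf stat -e cache-misses:R --record-tpebs -a sleep 0.01
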
Committer testing: root@x1:~# perf test tpebs 123: test Intel TPEBS counting mode : Ok root@x1:~# set -o vi root@x1:~# perf test tpebs 123: test Intel TPEBS counting mode : Ok root@x1:~# perf test -v tpebs 123: test Intel TPEBS counting mode : Ok root@x1:~# perf test -vvv tpebs 123: test Intel TPEBS counting mode: --- start --- test child forked, pid 16603 Testing without --record-tpebs Testing with --record-tpebs ---- end(0) ---- 123: test Intel TPEBS counting mode : Ok root@x1:~# Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-9-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_stat_intel_tpebs.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 tools/perf/tests/shell/test_stat_intel_tpebs.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/test_stat_intel_tpebs.sh b/tools/perf/tests/shell/test_stat_intel_tpebs.sh new file mode 100755 index 000000000000..c60b29add980 --- /dev/null +++ b/tools/perf/tests/shell/test_stat_intel_tpebs.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# test Intel TPEBS counting mode +# SPDX-License-Identifier: GPL-2.0 + +set -e +grep -q GenuineIntel /proc/cpuinfo || { echo Skipping non-Intel; exit 2; } + +# Use this event for testing because it should exist in all platforms +event=cache-misses:R + +# Without this cmd option, default value or zero is returned +echo "Testing without --record-tpebs" +result=$(perf stat -e "$event" true 2>&1) +[[ "$result" =~ $event ]] || exit 1 + +# In platforms that do not support TPEBS, it should execute without error. +echo "Testing with --record-tpebs" +result=$(perf stat -e "$event" --record-tpebs -a sleep 0.01 2>&1) +[[ "$result" =~ "perf record" && "$result" =~ $event ]] || exit 1 -- cgit v1.2.3 From 1a9d080d19c3303569614ed4c3b43a39aacd90d7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 15:41:19 -0700 Subject: perf callchain: Add a for_each callback style API Add a for_each callback style API to callchain with sample__for_each_callchain_node(). Possibly in the future such an API can avoid the overhead of constructing the call chain list. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Casey Chen Cc: Colin Ian King Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Tom Zanussi Link: https://lore.kernel.org/r/20240812224119.744968-1-irogers@google.com [ Split from a larger patch that introduced the API and use it ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 35 +++++++++++++++++++++++++++++++++++ tools/perf/util/callchain.h | 6 ++++++ 2 files changed, 41 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 6d075648d2cc..0d608e875fe9 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1797,3 +1797,38 @@ s64 callchain_avg_cycles(struct callchain_node *cnode) return cycles; } + +int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel, + struct perf_sample *sample, int max_stack, + callchain_iter_fn cb, void *data) +{ + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + int ret; + + if (!cursor) + return -ENOMEM; + + /* Fill in the callchain. 
*/ + ret = thread__resolve_callchain(thread, cursor, evsel, sample, + /*parent=*/NULL, /*root_al=*/NULL, + max_stack); + if (ret) + return ret; + + /* Switch from writing the callchain to reading it. */ + callchain_cursor_commit(cursor); + + while (1) { + struct callchain_cursor_node *node = callchain_cursor_current(cursor); + + if (!node) + break; + + ret = cb(node, data); + if (ret) + return ret; + + callchain_cursor_advance(cursor); + } + return 0; +} diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index d5c66345ae31..76891f8e2373 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -311,4 +311,10 @@ u64 callchain_total_hits(struct hists *hists); s64 callchain_avg_cycles(struct callchain_node *cnode); +typedef int (*callchain_iter_fn)(struct callchain_cursor_node *node, void *data); + +int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel, + struct perf_sample *sample, int max_stack, + callchain_iter_fn cb, void *data); + #endif /* __PERF_CALLCHAIN_H */ -- cgit v1.2.3 From 3d557dd3f54e329556e880129d0181988893b00f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 15:41:19 -0700 Subject: perf inject: Inject build ids for entire call chain The DSO build id is injected when the dso is first encountered but the checking for first encountered only looks at the sample->ip not the entire callchain. Use the callchain logic to ensure all build ids are inserted. Fixes: 454c407ec17a0c63 ("perf: add perf-inject builtin") Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Casey Chen Cc: Colin Ian King Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Tom Zanussi Link: https://lore.kernel.org/r/20240812224119.744968-1-irogers@google.com [ Split from a larger patch that introduced the API and use it ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index ef9cba173dd2..a35bde3f3c09 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -743,6 +743,29 @@ static int dso__inject_build_id(struct dso *dso, const struct perf_tool *tool, return 0; } +struct mark_dso_hit_args { + const struct perf_tool *tool; + struct machine *machine; + u8 cpumode; +}; + +static int mark_dso_hit_callback(struct callchain_cursor_node *node, void *data) +{ + struct mark_dso_hit_args *args = data; + struct map *map = node->ms.map; + + if (map) { + struct dso *dso = map__dso(map); + + if (dso && !dso__hit(dso)) { + dso__set_hit(dso); + dso__inject_build_id(dso, args->tool, args->machine, + args->cpumode, map__flags(map)); + } + } + return 0; +} + int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel __maybe_unused, @@ -750,6 +773,11 @@ int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *e { struct addr_location al; struct thread *thread; + struct mark_dso_hit_args args = { + .tool = tool, + .machine = machine, + .cpumode = sample->cpumode, + }; addr_location__init(&al); thread = machine__findnew_thread(machine, sample->pid, sample->tid); @@ -769,6 +797,9 @@ int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *e } } + sample__for_each_callchain_node(thread, evsel, sample, PERF_MAX_STACK_DEPTH, + mark_dso_hit_callback, &args); + 
thread__put(thread); repipe: perf_event__repipe(tool, event, sample, machine); -- cgit v1.2.3 From 92ec8b9367ab328cdd3d4409464137bc9775a2e5 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Thu, 8 Aug 2024 07:59:02 +0000 Subject: selftests/bpf: Avoid subtraction after htons() in ipip tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On little-endian systems, doing subtraction after htons() leads to interesting results: Given: MAGIC_BYTES = 123 = 0x007B aka. in big endian: 0x7B00 = 31488 sizeof(struct iphdr) = 20 Before this patch: __bpf_constant_htons(MAGIC_BYTES) - sizeof(struct iphdr) = 0x7AEC 0x7AEC = htons(0xEC7A) = htons(60538) So these were outer IP packets with a total length of 123 bytes, containing an inner IP packet with a total length of 60538 bytes. After this patch: __bpf_constant_htons(MAGIC_BYTES - sizeof(struct iphdr)) = htons(103) Now these packets are outer IP packets with a total length of 123 bytes, containing an inner IP packet with a total length of 103 bytes. Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20240808075906.1849564-1-ast@fiberby.net Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9e5f38739104..6b3078dd5645 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -378,8 +378,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 80, .tcp.dest = 8080, @@ -407,8 +407,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 80, .tcp.dest = 8080, @@ -436,8 +436,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 99, .tcp.dest = 9090, -- cgit v1.2.3 From 1c5144a066fd17eba681b3d6b0f86bda8e06fbfa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:50 -0700 Subject: torture: Add torture.sh --guest-cpu-limit argument for limited hosts Some servers have limitations on the number of CPUs a given guest OS can use. In my earlier experience, such limitations have been at least half of the host's CPUs, but in a recent example, this limit is less than 40%. This commit therefore adds a --guest-cpu-limit argument that allows such low limits to be made known to torture.sh. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/bin/torture.sh | 38 ++++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 990d24696fd3..0447c4a00cc4 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -19,10 +19,10 @@ PATH=${RCUTORTURE}/bin:$PATH; export PATH TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" MAKE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS*2)) -HALF_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) -if test "$HALF_ALLOTED_CPUS" -lt 1 +SCALE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) +if test "$SCALE_ALLOTED_CPUS" -lt 1 then - HALF_ALLOTED_CPUS=1 + SCALE_ALLOTED_CPUS=1 fi VERBOSE_BATCH_CPUS=$((TORTURE_ALLOTED_CPUS/16)) if test "$VERBOSE_BATCH_CPUS" -lt 2 @@ -90,6 +90,7 @@ usage () { echo " --do-scftorture / --do-no-scftorture / --no-scftorture" echo " --do-srcu-lockdep / --do-no-srcu-lockdep / --no-srcu-lockdep" echo " --duration [ | h | d ]" + echo " --guest-cpu-limit N" echo " --kcsan-kmake-arg kernel-make-arguments" exit 1 } @@ -203,6 +204,21 @@ do duration_base=$(($ts*mult)) shift ;; + --guest-cpu-limit|--guest-cpu-lim) + checkarg --guest-cpu-limit "(number)" "$#" "$2" '^[0-9]*$' '^--' + if (("$2" <= "$TORTURE_ALLOTED_CPUS" / 2)) + then + SCALE_ALLOTED_CPUS="$2" + VERBOSE_BATCH_CPUS="$((SCALE_ALLOTED_CPUS/8))" + if (("$VERBOSE_BATCH_CPUS" < 2)) + then + VERBOSE_BATCH_CPUS=0 + fi + else + echo "Ignoring value of $2 for --guest-cpu-limit which is greater than (("$TORTURE_ALLOTED_CPUS" / 2))." + fi + shift + ;; --kcsan-kmake-arg|--kcsan-kmake-args) checkarg --kcsan-kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' kcsan_kmake_args="`echo "$kcsan_kmake_args $2" | sed -e 's/^ *//' -e 's/ *$//'`" @@ -425,9 +441,9 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. 
- scfmem=$((3+HALF_ALLOTED_CPUS/16)) - torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" - torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make + scfmem=$((3+SCALE_ALLOTED_CPUS/16)) + torture_bootargs="scftorture.nthreads=$SCALE_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" + torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi if test "$do_rt" = "yes" @@ -471,8 +487,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make + torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$SCALE_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -520,8 +536,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$SCALE_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -559,7 +575,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" -- cgit v1.2.3 From 
3e49aea71d5bac93b892f954d14bc8070e4cc651 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Jul 2024 19:57:47 -0700 Subject: refscale: Add TINY scenario This commit adds a TINY scenario in order to support tests of Tiny RCU and Tiny SRCU. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- .../selftests/rcutorture/configs/refscale/TINY | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/refscale/TINY (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/refscale/TINY b/tools/testing/selftests/rcutorture/configs/refscale/TINY new file mode 100644 index 000000000000..759343980b80 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/TINY @@ -0,0 +1,20 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n +#CHECK#CONFIG_PREEMPT_RCU=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n -- cgit v1.2.3 From 41f37c852ac3fbfd072a00281b60dc7ba056be8c Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 13 Jun 2024 07:12:10 +0900 Subject: selftests/ftrace: Add required dependency for kprobe tests kprobe_args_{char,string}.tc are using available_filter_functions file which is provided by function tracer. Thus if function tracer is disabled, these tests are failed on recent kernels because tracefs_create_dir is not raised events by adding a dynamic event. Add available_filter_functions to requires line. Fixes: 7c1130ea5cae ("test: ftrace: Fix kprobe test for eventfs") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc | 2 +- tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc index e21c9c27ece4..77f4c07cdcb8 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Kprobe event char type argument -# requires: kprobe_events +# requires: kprobe_events available_filter_functions case `uname -m` in x86_64) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc index 93217d459556..39001073f7ed 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Kprobe event string type argument -# requires: kprobe_events +# requires: kprobe_events available_filter_functions case `uname -m` in x86_64) -- cgit v1.2.3 From a05031713d460de5ff8500680458ed8f3fd946e6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 21:06:12 -0700 Subject: perf disasm: Fix memory leak for locked operations lock__parse() calls disasm_line__parse() passing &ops->locked.ins.name that will use strdup() to populate it. 
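A simplified sketch of the leak (call signatures elided; an illustration, not the literal code):

  /*
   * lock__parse()
   *   -> disasm_line__parse(..., &ops->locked.ins.name, ...)
   *        -> ops->locked.ins.name = strdup(name);  // heap string
   *
   * lock__delete() already zfree()s ops->locked.ops, ops->target.raw
   * and ops->target.name, but nothing released ops->locked.ins.name,
   * so every locked instruction leaked its strdup()ed name.
   */
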
Ensure ops->locked.ins.name is freed in lock__delete(). Found with address/leak sanitizer. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20240813040613.882075-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 22289003e16d..226d2181f694 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -566,6 +566,7 @@ static void lock__delete(struct ins_operands *ops) ins_ops__delete(ops->locked.ops); zfree(&ops->locked.ops); + zfree(&ops->locked.ins.name); zfree(&ops->target.raw); zfree(&ops->target.name); } -- cgit v1.2.3 From 653ac51f53032df8a17fd5f9d6c7540a5e70f85c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 21:06:13 -0700 Subject: perf test annotate: Dump trapping test in trap handler Help to better identify the location of test failures but dumping the failing test in the trap handler. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20240813040613.882075-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/annotate.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/annotate.sh b/tools/perf/tests/shell/annotate.sh index b072d9b97387..2ccf4f1d46b6 100755 --- a/tools/perf/tests/shell/annotate.sh +++ b/tools/perf/tests/shell/annotate.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # perf annotate basic tests # SPDX-License-Identifier: GPL-2.0 @@ -28,6 +28,7 @@ cleanup() { } trap_cleanup() { + echo "Unexpected signal in ${FUNCNAME[1]}" cleanup exit 1 } -- cgit v1.2.3 From 3ef44458071a19e5b5832cdfe6f75273aa521b6e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:00 -0700 Subject: perf report: Fix --total-cycles --stdio output error The --total-cycles may output wrong information with the --stdio. For example: # perf record -e "{cycles,instructions}",cache-misses -b sleep 1 # perf report --total-cycles --stdio The total cycles output of {cycles,instructions} and cache-misses are almost the same. # Samples: 938 of events 'anon group { cycles, instructions }' # Event count (approx.): 938 # # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles [Program Block Range] # ............... .............. ........... .......... ..................................................> # 11.19% 2.6K 0.10% 21 [perf_iterate_ctx+48 -> > 5.79% 1.4K 0.45% 97 [__intel_pmu_enable_all.constprop.0+80 -> __intel_> 5.11% 1.2K 0.33% 71 [native_write_msr+0 ->> # Samples: 293 of event 'cache-misses' # Event count (approx.): 293 # # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles [Program Block Range] # ............... .............. ........... .......... ..................................................> # 11.19% 2.6K 0.13% 21 [perf_iterate_ctx+48 -> > 5.79% 1.4K 0.59% 97 [__intel_pmu_enable_all.constprop.0+80 -> __intel_> 5.11% 1.2K 0.43% 71 [native_write_msr+0 ->> With the symbol_conf.event_group, the 'perf report' should only report the block information of the leader event in a group. 
However, the current implementation retrieves the next event's block information, rather than the next group leader's block information. Make sure the index is updated even if the event is skipped. With the patch, # Samples: 293 of event 'cache-misses' # Event count (approx.): 293 # # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles [Program Block Range] # ............... .............. ........... .......... ..................................................> # 37.98% 9.0K 4.05% 299 [perf_event_addr_filters_exec+0 -> perf_event_a> 11.19% 2.6K 0.28% 21 [perf_iterate_ctx+48 -> > 5.79% 1.4K 1.32% 97 [__intel_pmu_enable_all.constprop.0+80 -> __intel_> Fixes: 6f7164fa231a5f36 ("perf report: Sort by sampled cycles percent per block for stdio") Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jin Yao Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index dfb47fa85e5c..04b9a5c1bc7e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -565,6 +565,7 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c struct hists *hists = evsel__hists(pos); const char *evname = evsel__name(pos); + i++; if (symbol_conf.event_group && !evsel__is_group_leader(pos)) continue; @@ -574,7 +575,7 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c hists__fprintf_nr_sample_events(hists, rep, evname, stdout); if (rep->total_cycles_mode) { - report__browse_block_hists(&rep->block_reports[i++].hist, + report__browse_block_hists(&rep->block_reports[i - 1].hist, rep->min_percent, pos, NULL); continue; } -- cgit v1.2.3 From 183212a45e5fe66e7f6cbd360d186174b013d023 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:01 -0700 Subject: perf report: Remove the first overflow check for branch counters A false overflow warning is triggered if a sample doesn't have any LBRs recorded and the branch counters feature is enabled. The current code does OVERFLOW_CHECK_u64() at the very beginning when reading the information of branch counters. It assumes that there is at least one LBR in the PEBS record. But it is a valid case that 0 LBR is recorded especially in a high context switch. Remove the OVERFLOW_CHECK_u64(). The later OVERFLOW_CHECK() should be good enough to check the overflow when reading the information of the branch counters. 
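A sketch of the parsing path after the change (simplified; the exact statement order and the later OVERFLOW_CHECK() arguments are assumptions, see the evsel.c hunk below for the real context):

  if (evsel__has_branch_counters(evsel)) {
          data->branch_stack_cntr = (u64 *)array;
          sz = data->branch_stack->nr * sizeof(u64); /* 0 when no LBRs */
          OVERFLOW_CHECK(array, sz, max_size); /* zero-sized region is fine */
          array = (void *)array + sz;
  }
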
Fixes: 9fbb4b02302b0ae6 ("perf tools: Add branch counter knob") Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d607056b73c9..f22f402d54cc 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2884,8 +2884,6 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, array = (void *)array + sz; if (evsel__has_branch_counters(evsel)) { - OVERFLOW_CHECK_u64(array); - data->branch_stack_cntr = (u64 *)array; sz = data->branch_stack->nr * sizeof(u64); -- cgit v1.2.3 From 3a867a6dadb2d67b85cbf9bc67eca2428075a109 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:02 -0700 Subject: perf evlist: Save branch counters information The branch counters logging (A.K.A LBR event logging) introduces a per-counter indication of precise event occurrences in LBRs. The kernel only dumps the number of occurrences into a record. The perf tool has to map the number to the corresponding event. Add evlist__update_br_cntr() to go through the evlist to pick the events that are configured to be logged. Assign a logical idx to track them, and add the total number of the events in the leader event. The total number will be used to allocate the space to save the branch counters for a block. The logical idx will be used to locate the corresponding event quickly in the following patches. It only needs to iterate the evlist once. The evsel__has_branch_counters() is also optimized. 
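A walk-through of the resulting bookkeeping on a hypothetical three-event group (event names are illustrative only):

  /*
   * group: { cycles (leader),
   *          branch_ev_A with PERF_SAMPLE_BRANCH_COUNTERS,
   *          branch_ev_B with PERF_SAMPLE_BRANCH_COUNTERS }
   *
   * after evlist__update_br_cntr():
   *   branch_ev_A->br_cntr_idx == 0
   *   branch_ev_B->br_cntr_idx == 1
   *   cycles->br_cntr_nr       == 2   (only the leader holds the total)
   *   evlist->nr_br_cntr       == 2
   */
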
Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-4-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 15 +++++++++++++++ tools/perf/util/evlist.h | 2 ++ tools/perf/util/evsel.c | 13 +++++++------ tools/perf/util/evsel.h | 8 ++++++++ 4 files changed, 32 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4b2538e4f679..68bbd3ea771b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -79,6 +79,7 @@ void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus, evlist->ctl_fd.fd = -1; evlist->ctl_fd.ack = -1; evlist->ctl_fd.pos = -1; + evlist->nr_br_cntr = -1; } struct evlist *evlist__new(void) @@ -1264,6 +1265,20 @@ u64 evlist__combined_branch_type(struct evlist *evlist) return branch_type; } +void evlist__update_br_cntr(struct evlist *evlist) +{ + struct evsel *evsel; + int i = 0; + + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) { + evsel->br_cntr_idx = i++; + evsel__leader(evsel)->br_cntr_nr++; + } + } + evlist->nr_br_cntr = i; +} + bool evlist__valid_read_format(struct evlist *evlist) { struct evsel *first = evlist__first(evlist), *pos = first; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index cccc34da5a02..b46f1a320783 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -57,6 +57,7 @@ struct evlist { bool enabled; int id_pos; int is_pos; + int nr_br_cntr; u64 combined_sample_type; enum bkw_mmap_state bkw_mmap_state; struct { @@ -219,6 +220,7 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel, u64 __evlist__combined_sample_type(struct evlist *evlist); u64 evlist__combined_sample_type(struct evlist *evlist); u64 evlist__combined_branch_type(struct evlist *evlist); +void evlist__update_br_cntr(struct evlist *evlist); bool evlist__sample_id_all(struct evlist *evlist); u16 evlist__id_hdr_size(struct evlist *evlist); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f22f402d54cc..38a74d6dde49 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2636,17 +2636,18 @@ u64 evsel__bitfield_swap_branch_flags(u64 value) static inline bool evsel__has_branch_counters(const struct evsel *evsel) { - struct evsel *cur, *leader = evsel__leader(evsel); + struct evsel *leader = evsel__leader(evsel); /* The branch counters feature only supports group */ if (!leader || !evsel->evlist) return false; - evlist__for_each_entry(evsel->evlist, cur) { - if ((leader == evsel__leader(cur)) && - (cur->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) - return true; - } + if (evsel->evlist->nr_br_cntr < 0) + evlist__update_br_cntr(evsel->evlist); + + if (leader->br_cntr_nr > 0) + return true; + return false; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a5da4b03bb1c..a393dae1dc96 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -148,6 +148,14 @@ struct evsel { */ __u64 synth_sample_type; + /* + * Store the branch counter related information. 
+ * br_cntr_idx: The idx of the branch counter event in the evlist + * br_cntr_nr: The number of the branch counter event in the group + * (Only available for the leader event) + */ + int br_cntr_idx; + int br_cntr_nr; /* * bpf_counter_ops serves two use cases: * 1. perf-stat -b counting events used byBPF programs -- cgit v1.2.3 From 1f2b7fbb04f53540090ac7d320194654f142443f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:03 -0700 Subject: perf annotate: Save branch counters for each block When annotating a basic block, it's useful to display the occurrences of other events in the block. The branch counter feature is only available for newer Intel platforms. So a dedicated option to display the branch counters is not introduced. Reuse the existing --total-cycles option, which triggers the annotation of a basic block and displays the cycle-related annotation. When the branch counters information is available, the branch counters are automatically appended after all the cycle-related annotation. Accounting the branch counters as well when accounting the cycles in hist__account_cycles(). In 'struct annotated_branch', introduce a br_cntr array to save the accumulation of each branch counter. In a sample, all the branch counters for a branch are saved in a u64 space. Because the saturation of a branch counter is small, e.g., for Intel Sierra Forest, the saturation is only 3. Add ANNOTATION__BR_CNTR_SATURATED_FLAG to indicate if a branch counter once saturated. That can be used to indicate a potential event lost because of the saturation. Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-5-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 3 +- tools/perf/builtin-diff.c | 4 +-- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-top.c | 4 +-- tools/perf/util/annotate.c | 68 +++++++++++++++++++++++++++++++++++-------- tools/perf/util/annotate.h | 10 ++++++- tools/perf/util/branch.h | 1 + tools/perf/util/hist.c | 5 ++-- tools/perf/util/hist.h | 2 +- tools/perf/util/machine.c | 3 ++ 10 files changed, 80 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index d6f6ba5a569d..9b4a8a379b5b 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -221,7 +221,8 @@ static int process_branch_callback(struct evsel *evsel, if (a.map != NULL) dso__set_hit(map__dso(a.map)); - hist__account_cycles(sample->branch_stack, al, sample, false, NULL); + hist__account_cycles(sample->branch_stack, al, sample, false, + NULL, evsel); ret = hist_entry_iter__add(&iter, &a, PERF_MAX_STACK_DEPTH, ann); out: diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 28c5208fcdc9..6983b5373372 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -431,8 +431,8 @@ static int diff__process_sample_event(const struct perf_tool *tool, goto out; } - hist__account_cycles(sample->branch_stack, &al, sample, false, - NULL); + hist__account_cycles(sample->branch_stack, &al, sample, + false, NULL, evsel); break; case COMPUTE_STREAM: diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 04b9a5c1bc7e..ab450644e5b3 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -328,7 +328,7 @@ static int process_sample_event(const struct 
perf_tool *tool, if (ui__has_annotation() || rep->symbol_ipc || rep->total_cycles_mode) { hist__account_cycles(sample->branch_stack, &al, sample, rep->nonany_branch_mode, - &rep->total_cycles); + &rep->total_cycles, evsel); } ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 1cdac4e190e9..881b861c35ee 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -735,8 +735,8 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter, perf_top__record_precise_ip(top, iter->he, iter->sample, evsel, al->addr); hist__account_cycles(iter->sample->branch_stack, al, iter->sample, - !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY), - NULL); + !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY), + NULL, evsel); return 0; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index a87d2e97e10d..c269758d47c8 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -266,22 +266,30 @@ struct annotated_branch *annotation__get_branch(struct annotation *notes) return notes->branch; } -static struct cyc_hist *symbol__cycles_hist(struct symbol *sym) +static struct annotated_branch *symbol__find_branch_hist(struct symbol *sym, + unsigned int br_cntr_nr) { struct annotation *notes = symbol__annotation(sym); struct annotated_branch *branch; + const size_t size = symbol__size(sym); branch = annotation__get_branch(notes); if (branch == NULL) return NULL; if (branch->cycles_hist == NULL) { - const size_t size = symbol__size(sym); - branch->cycles_hist = calloc(size, sizeof(struct cyc_hist)); + if (!branch->cycles_hist) + return NULL; + } + + if (br_cntr_nr && branch->br_cntr == NULL) { + branch->br_cntr = calloc(br_cntr_nr * size, sizeof(u64)); + if (!branch->br_cntr) + return NULL; } - return branch->cycles_hist; + return branch; } struct annotated_source *symbol__hists(struct symbol *sym, int nr_hists) @@ -316,16 +324,44 @@ static int symbol__inc_addr_samples(struct map_symbol *ms, return src ? 
__symbol__inc_addr_samples(ms, src, evsel->core.idx, addr, sample) : 0; } -static int symbol__account_cycles(u64 addr, u64 start, - struct symbol *sym, unsigned cycles) +static int symbol__account_br_cntr(struct annotated_branch *branch, + struct evsel *evsel, + unsigned offset, + u64 br_cntr) +{ + unsigned int br_cntr_nr = evsel__leader(evsel)->br_cntr_nr; + unsigned int base = evsel__leader(evsel)->br_cntr_idx; + unsigned int width = evsel__env(evsel)->br_cntr_width; + unsigned int off = offset * evsel->evlist->nr_br_cntr; + unsigned int i, mask = (1L << width) - 1; + u64 *branch_br_cntr = branch->br_cntr; + + if (!br_cntr || !branch_br_cntr) + return 0; + + for (i = 0; i < br_cntr_nr; i++) { + u64 cntr = (br_cntr >> i * width) & mask; + + branch_br_cntr[off + i + base] += cntr; + if (cntr == mask) + branch_br_cntr[off + i + base] |= ANNOTATION__BR_CNTR_SATURATED_FLAG; + } + + return 0; +} + +static int symbol__account_cycles(u64 addr, u64 start, struct symbol *sym, + unsigned cycles, struct evsel *evsel, + u64 br_cntr) { - struct cyc_hist *cycles_hist; + struct annotated_branch *branch; unsigned offset; + int ret; if (sym == NULL) return 0; - cycles_hist = symbol__cycles_hist(sym); - if (cycles_hist == NULL) + branch = symbol__find_branch_hist(sym, evsel->evlist->nr_br_cntr); + if (!branch) return -ENOMEM; if (addr < sym->start || addr >= sym->end) return -ERANGE; @@ -337,15 +373,22 @@ static int symbol__account_cycles(u64 addr, u64 start, start = 0; } offset = addr - sym->start; - return __symbol__account_cycles(cycles_hist, + ret = __symbol__account_cycles(branch->cycles_hist, start ? start - sym->start : 0, offset, cycles, !!start); + + if (ret) + return ret; + + return symbol__account_br_cntr(branch, evsel, offset, br_cntr); } int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, struct addr_map_symbol *start, - unsigned cycles) + unsigned cycles, + struct evsel *evsel, + u64 br_cntr) { u64 saddr = 0; int err; @@ -371,7 +414,7 @@ int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, start ? start->addr : 0, ams->ms.sym ? ams->ms.sym->start + map__start(ams->ms.map) : 0, saddr); - err = symbol__account_cycles(ams->al_addr, saddr, ams->ms.sym, cycles); + err = symbol__account_cycles(ams->al_addr, saddr, ams->ms.sym, cycles, evsel, br_cntr); if (err) pr_debug2("account_cycles failed %d\n", err); return err; @@ -412,6 +455,7 @@ static void annotated_branch__delete(struct annotated_branch *branch) { if (branch) { zfree(&branch->cycles_hist); + free(branch->br_cntr); free(branch); } } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 0e52558baa01..6b3023f3b18f 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -14,6 +14,7 @@ #include "spark.h" #include "hashmap.h" #include "disasm.h" +#include "branch.h" struct hist_browser_timer; struct hist_entry; @@ -288,6 +289,9 @@ struct annotated_source { struct annotation_line *annotated_source__get_line(struct annotated_source *src, s64 offset); +/* A branch counter once saturated */ +#define ANNOTATION__BR_CNTR_SATURATED_FLAG (1ULL << 63) + /** * struct annotated_branch - basic block and IPC information for a symbol. * @@ -297,6 +301,7 @@ struct annotation_line *annotated_source__get_line(struct annotated_source *src, * @cover_insn: Number of distinct, actually executed instructions. * @cycles_hist: Array of cyc_hist for each instruction. * @max_coverage: Maximum number of covered basic block (used for block-range). 
+ * @br_cntr: Array of the occurrences of events (branch counters) during a block. * * This struct is used by two different codes when the sample has branch stack * and cycles information. annotation__compute_ipc() calculates average IPC @@ -313,6 +318,7 @@ struct annotated_branch { unsigned int cover_insn; struct cyc_hist *cycles_hist; u64 max_coverage; + u64 *br_cntr; }; struct LOCKABLE annotation { @@ -383,7 +389,9 @@ struct annotated_branch *annotation__get_branch(struct annotation *notes); int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, struct addr_map_symbol *start, - unsigned cycles); + unsigned cycles, + struct evsel *evsel, + u64 br_cntr); int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample, struct evsel *evsel, u64 addr); diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 87704d713ff6..b80c12c74bbb 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -34,6 +34,7 @@ struct branch_info { struct addr_map_symbol from; struct addr_map_symbol to; struct branch_flags flags; + u64 branch_stack_cntr; char *srcline_from; char *srcline_to; }; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index c8c1b511f8a7..0f554febf9a1 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2677,7 +2677,7 @@ int hists__unlink(struct hists *hists) void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, struct perf_sample *sample, bool nonany_branch_mode, - u64 *total_cycles) + u64 *total_cycles, struct evsel *evsel) { struct branch_info *bi; struct branch_entry *entries = perf_sample__branch_entries(sample); @@ -2701,7 +2701,8 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, for (int i = bs->nr - 1; i >= 0; i--) { addr_map_symbol__account_cycles(&bi[i].from, nonany_branch_mode ? 
NULL : prev, - bi[i].flags.cycles); + bi[i].flags.cycles, evsel, + bi[i].branch_stack_cntr); prev = &bi[i].to; if (total_cycles) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 5273f5c37050..30c13fc8cbe4 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -742,7 +742,7 @@ unsigned int hists__overhead_width(struct hists *hists); void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, struct perf_sample *sample, bool nonany_branch_mode, - u64 *total_cycles); + u64 *total_cycles, struct evsel *evsel); struct option; int parse_filter_percentage(const struct option *opt, const char *arg, int unset); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 706be5e4a076..c08774d06d14 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2141,6 +2141,7 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, unsigned int i; const struct branch_stack *bs = sample->branch_stack; struct branch_entry *entries = perf_sample__branch_entries(sample); + u64 *branch_stack_cntr = sample->branch_stack_cntr; struct branch_info *bi = calloc(bs->nr, sizeof(struct branch_info)); if (!bi) @@ -2150,6 +2151,8 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, ip__resolve_ams(al->thread, &bi[i].to, entries[i].to); ip__resolve_ams(al->thread, &bi[i].from, entries[i].from); bi[i].flags = entries[i].flags; + if (branch_stack_cntr) + bi[i].branch_stack_cntr = branch_stack_cntr[i]; } return bi; } -- cgit v1.2.3 From 7398bf181d59a8c8d8d13cb555bf3f44b70f6221 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:04 -0700 Subject: perf evsel: Assign abbr name for the branch counter events There could be several branch counter events. If the perf tool were to output the result in the format "event name + a number", the lines could become very long and hard to read. An abbreviation is introduced to replace the full event name in the display. The abbreviations run from 'A' to 'Z9', which can support up to 286 events. The same abbreviation is assigned if the same event is found more than once in the evlist. The next patch will utilize the abbreviation name to show the branch counter events in the output. Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-6-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++- tools/perf/util/evsel.h | 6 ++++++ 2 files changed, 60 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 68bbd3ea771b..c0379fa1ccfe 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -33,6 +33,7 @@ #include "util/bpf-filter.h" #include "util/stat.h" #include "util/util.h" +#include "util/env.h" #include "util/intel-tpebs.h" #include #include @@ -1265,15 +1266,67 @@ u64 evlist__combined_branch_type(struct evlist *evlist) return branch_type; } +static struct evsel * +evlist__find_dup_event_from_prev(struct evlist *evlist, struct evsel *event) +{ + struct evsel *pos; + + evlist__for_each_entry(evlist, pos) { + if (event == pos) + break; + if ((pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) && + !strcmp(pos->name, event->name)) + return pos; + } + return NULL; +} + +#define MAX_NR_ABBR_NAME (26 * 11) + +/* + * The abbr name is from A to Z9.
If the number of event + * which requires the branch counter > MAX_NR_ABBR_NAME, + * return NA. + */ +static void evlist__new_abbr_name(char *name) +{ + static int idx; + int i = idx / 26; + + if (idx >= MAX_NR_ABBR_NAME) { + name[0] = 'N'; + name[1] = 'A'; + name[2] = '\0'; + return; + } + + name[0] = 'A' + (idx % 26); + + if (!i) + name[1] = '\0'; + else { + name[1] = '0' + i - 1; + name[2] = '\0'; + } + + idx++; +} + void evlist__update_br_cntr(struct evlist *evlist) { - struct evsel *evsel; + struct evsel *evsel, *dup; int i = 0; evlist__for_each_entry(evlist, evsel) { if (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) { evsel->br_cntr_idx = i++; evsel__leader(evsel)->br_cntr_nr++; + + dup = evlist__find_dup_event_from_prev(evlist, evsel); + if (dup) + memcpy(evsel->abbr_name, dup->abbr_name, 3 * sizeof(char)); + else + evlist__new_abbr_name(evsel->abbr_name); } } evlist->nr_br_cntr = i; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a393dae1dc96..4316992f6a69 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -153,9 +153,15 @@ struct evsel { * br_cntr_idx: The idx of the branch counter event in the evlist * br_cntr_nr: The number of the branch counter event in the group * (Only available for the leader event) + * abbr_name: The abbreviation name assigned to an event which is + * logged by the branch counter. + * The abbr name is from A to Z9. NA is applied if out + * of the range. */ int br_cntr_idx; int br_cntr_nr; + char abbr_name[3]; + /* * bpf_counter_ops serves two use cases: * 1. perf-stat -b counting events used byBPF programs -- cgit v1.2.3 From 20d6f555283915f24d52e29a982b547cf6517f06 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:05 -0700 Subject: perf report: Display the branch counter histogram Reuse the existing --total-cycles option to display the branch counters. Add a new PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER to display the logged branch counter events. They are shown right after all the cycle-related annotations. Extend 'struct block_info' to store and pass the branch counter related information. annotation_br_cntr_entry() prints the histogram of each branch counter event. If an event was logged fewer than 4 times in a block, its abbr name is repeated exactly that many times; otherwise a '+' is appended to stand for more than 3 occurrences, since the display assumes at most 3 occurrences are shown per event. annotation_br_cntr_abbr_list() prints the branch counter abbreviation list; press 'B' to display it in the TUI mode.

$ perf record -e "{branch-instructions:ppp,branch-misses}:S" -j any,counter
$ perf report --total-cycles --stdio
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 1M of events 'anon group { branch-instructions:ppp, branch-misses }'
# Event count (approx.): 1610046
#
# Branch counter abbr list:
# branch-instructions:ppp = A
# branch-misses = B
# '-' No event occurs
# '+' Event occurrences may be lost due to branch counter saturated
#
# Sampled Cycles%  Sampled Cycles  Avg Cycles%  Avg Cycles  Branch Counter  [Program Block Range]
# ...............  ..............  ...........  ..........  ..............  ..................
#
57.55% 2.5M 0.00% 3 |A |- | ...
25.27% 1.1M 0.00% 2 |AA |- | ...
15.61% 667.2K 0.00% 1 |A |- | ...
0.16% 6.9K 0.81% 575 |A |- | ...
0.16% 6.8K 1.38% 977 |AA |- | ...
0.16% 6.8K 0.04% 28 |AA |B | ...
0.15% 6.6K 1.33% 946 |A |- | ...
0.11% 4.5K 0.06% 46 |AAA+|- | ...
0.10% 4.4K 0.88% 624 |A |- | ...
0.09% 3.7K 0.74% 524 |AAA+|B | ...

With -v applied,

# Sampled Cycles%  Sampled Cycles  Avg Cycles%  Avg Cycles  Branch Counter  [Program Block Range]
# ...............  ..............  ...........  ..........  ..............  ..................
#
57.55% 2.5M 0.00% 3 A=1 ,B=- ...
25.27% 1.1M 0.00% 2 A=2 ,B=- ...
15.61% 667.2K 0.00% 1 A=1 ,B=- ...
0.16% 6.9K 0.81% 575 A=1 ,B=- ...
0.16% 6.8K 1.38% 977 A=2 ,B=- ...
0.16% 6.8K 0.04% 28 A=2 ,B=1 ...
0.15% 6.6K 1.33% 946 A=1 ,B=- ...
0.11% 4.5K 0.06% 46 A=3+,B=- ...
0.10% 4.4K 0.88% 624 A=1 ,B=- ...
0.09% 3.7K 0.74% 524 A=3+,B=1 ...

Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-7-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 1 + tools/perf/builtin-diff.c | 4 +- tools/perf/builtin-report.c | 20 ++++- tools/perf/ui/browsers/hists.c | 17 +++- tools/perf/util/annotate.c | 145 +++++++++++++++++++++++++++++++ tools/perf/util/annotate.h | 3 + tools/perf/util/block-info.c | 66 ++++++++++++-- tools/perf/util/block-info.h | 8 +- 8 files changed, 246 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index d2b1593ef700..7c66d81ab978 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -614,6 +614,7 @@ include::itrace.txt[] 'Avg Cycles%' - block average sampled cycles / sum of total block average sampled cycles 'Avg Cycles' - block average sampled cycles + 'Branch Counter' - block branch counter histogram (with -v showing the number) --skip-empty:: Do not print 0 results in the --stat output.
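As a rough sketch of how one 'Branch Counter' histogram cell above is built (a simplified stand-in for the annotation_br_cntr_entry() logic this patch adds; the per-event average occurrence count and the saturation flag are assumed to be computed already, and buf is assumed large enough for the cell):

#include <stdio.h>
#include <stdbool.h>

/* Repeat the abbr name once per average occurrence, cap the repetition
 * at 3 and append '+' beyond that or when the hardware counter
 * saturated; '-' marks a counter that never fired in the block. */
static void br_cntr_cell_sketch(char *buf, size_t sz, const char *abbr,
				int avg, bool saturated)
{
	int n = snprintf(buf, sz, "|");

	if (!avg) {
		snprintf(buf + n, sz - n, "-");
		return;
	}
	for (int j = 0; j < avg && j < 3; j++)
		n += snprintf(buf + n, sz - n, "%s", abbr);
	if (avg > 3 || saturated)
		snprintf(buf + n, sz - n, "+");
}

Called as br_cntr_cell_sketch(buf, sizeof(buf), "A", 2, false), this yields "|AA", matching the second row of the default output above; an average above 3 yields "|AAA+", as in the last two rows.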
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 6983b5373372..23326dd20333 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -691,7 +691,7 @@ static void hists__precompute(struct hists *hists) if (compute == COMPUTE_CYCLES) { bh = container_of(he, struct block_hist, he); init_block_hist(bh); - block_info__process_sym(he, bh, NULL, 0); + block_info__process_sym(he, bh, NULL, 0, 0); } data__for_each_file_new(i, d) { @@ -714,7 +714,7 @@ static void hists__precompute(struct hists *hists) pair_bh = container_of(pair, struct block_hist, he); init_block_hist(pair_bh); - block_info__process_sym(pair, pair_bh, NULL, 0); + block_info__process_sym(pair, pair_bh, NULL, 0, 0); bh = container_of(he, struct block_hist, he); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index ab450644e5b3..1643113616f4 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -575,6 +575,13 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c hists__fprintf_nr_sample_events(hists, rep, evname, stdout); if (rep->total_cycles_mode) { + char *buf; + + if (!annotation_br_cntr_abbr_list(&buf, pos, true)) { + fprintf(stdout, "%s", buf); + fprintf(stdout, "#\n"); + free(buf); + } report__browse_block_hists(&rep->block_reports[i - 1].hist, rep->min_percent, pos, NULL); continue; @@ -1119,18 +1126,23 @@ static int __cmd_report(struct report *rep) report__output_resort(rep); if (rep->total_cycles_mode) { - int block_hpps[6] = { + int nr_hpps = 4; + int block_hpps[PERF_HPP_REPORT__BLOCK_MAX_INDEX] = { PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT, PERF_HPP_REPORT__BLOCK_LBR_CYCLES, PERF_HPP_REPORT__BLOCK_CYCLES_PCT, PERF_HPP_REPORT__BLOCK_AVG_CYCLES, - PERF_HPP_REPORT__BLOCK_RANGE, - PERF_HPP_REPORT__BLOCK_DSO, }; + if (session->evlist->nr_br_cntr > 0) + block_hpps[nr_hpps++] = PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER; + + block_hpps[nr_hpps++] = PERF_HPP_REPORT__BLOCK_RANGE; + block_hpps[nr_hpps++] = PERF_HPP_REPORT__BLOCK_DSO; + rep->block_reports = block_info__create_report(session->evlist, rep->total_cycles, - block_hpps, 6, + block_hpps, nr_hpps, &rep->nr_block_reports); if (!rep->block_reports) return -1; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index b7219df51236..970f7f349298 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -3684,8 +3684,10 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, struct hist_browser *browser; int key = -1; struct popup_action action; + char *br_cntr_text = NULL; static const char help[] = - " q Quit \n"; + " q Quit \n" + " B Branch counter abbr list (Optional)\n"; browser = hist_browser__new(hists); if (!browser) @@ -3703,6 +3705,8 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, memset(&action, 0, sizeof(action)); + annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false); + while (1) { key = hist_browser__run(browser, "? 
- help", true, 0); @@ -3723,6 +3727,16 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, action.ms.sym = browser->selection->sym; do_annotate(browser, &action); continue; + case 'B': + if (br_cntr_text) { + ui__question_window("Branch counter abbr list", + br_cntr_text, "Press any key...", 0); + } else { + ui__question_window("Branch counter abbr list", + "\n The branch counter is not available.\n", + "Press any key...", 0); + } + continue; default: break; } @@ -3730,5 +3744,6 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, out: hist_browser__delete(browser); + free(br_cntr_text); return 0; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index c269758d47c8..5200cafe20e7 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -41,6 +41,7 @@ #include "namespaces.h" #include "thread.h" #include "hashmap.h" +#include "strbuf.h" #include #include #include @@ -48,6 +49,7 @@ #include #include #include +#include /* FIXME: For the HE_COLORSET */ #include "ui/browser.h" @@ -1719,6 +1721,149 @@ static void ipc_coverage_string(char *bf, int size, struct annotation *notes) ipc, coverage); } +int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header) +{ + struct evsel *pos; + struct strbuf sb; + + if (evsel->evlist->nr_br_cntr <= 0) + return -ENOTSUP; + + strbuf_init(&sb, /*hint=*/ 0); + + if (header && strbuf_addf(&sb, "# Branch counter abbr list:\n")) + goto err; + + evlist__for_each_entry(evsel->evlist, pos) { + if (!(pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) + continue; + if (header && strbuf_addf(&sb, "#")) + goto err; + + if (strbuf_addf(&sb, " %s = %s\n", pos->name, pos->abbr_name)) + goto err; + } + + if (header && strbuf_addf(&sb, "#")) + goto err; + if (strbuf_addf(&sb, " '-' No event occurs\n")) + goto err; + + if (header && strbuf_addf(&sb, "#")) + goto err; + if (strbuf_addf(&sb, " '+' Event occurrences may be lost due to branch counter saturated\n")) + goto err; + + *str = strbuf_detach(&sb, NULL); + + return 0; +err: + strbuf_release(&sb); + return -ENOMEM; +} + +/* Assume the branch counter saturated at 3 */ +#define ANNOTATION_BR_CNTR_SATURATION 3 + +int annotation_br_cntr_entry(char **str, int br_cntr_nr, + u64 *br_cntr, int num_aggr, + struct evsel *evsel) +{ + struct evsel *pos = evsel ? evlist__first(evsel->evlist) : NULL; + bool saturated = false; + int i, j, avg, used; + struct strbuf sb; + + strbuf_init(&sb, /*hint=*/ 0); + for (i = 0; i < br_cntr_nr; i++) { + used = 0; + avg = ceil((double)(br_cntr[i] & ~ANNOTATION__BR_CNTR_SATURATED_FLAG) / + (double)num_aggr); + + /* + * A histogram with the abbr name is displayed by default. + * With -v, the exact number of branch counter is displayed. 
+ */ + if (verbose) { + evlist__for_each_entry_from(evsel->evlist, pos) { + if ((pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) && + (pos->br_cntr_idx == i)) + break; + } + if (strbuf_addstr(&sb, pos->abbr_name)) + goto err; + + if (!br_cntr[i]) { + if (strbuf_addstr(&sb, "=-")) + goto err; + } else { + if (strbuf_addf(&sb, "=%d", avg)) + goto err; + } + if (br_cntr[i] & ANNOTATION__BR_CNTR_SATURATED_FLAG) { + if (strbuf_addch(&sb, '+')) + goto err; + } else { + if (strbuf_addch(&sb, ' ')) + goto err; + } + + if ((i < br_cntr_nr - 1) && strbuf_addch(&sb, ',')) + goto err; + continue; + } + + if (strbuf_addch(&sb, '|')) + goto err; + + if (!br_cntr[i]) { + if (strbuf_addch(&sb, '-')) + goto err; + used++; + } else { + evlist__for_each_entry_from(evsel->evlist, pos) { + if ((pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) && + (pos->br_cntr_idx == i)) + break; + } + if (br_cntr[i] & ANNOTATION__BR_CNTR_SATURATED_FLAG) + saturated = true; + + for (j = 0; j < avg; j++, used++) { + /* Print + if the number of logged events > 3 */ + if (j >= ANNOTATION_BR_CNTR_SATURATION) { + saturated = true; + break; + } + if (strbuf_addstr(&sb, pos->abbr_name)) + goto err; + } + + if (saturated) { + if (strbuf_addch(&sb, '+')) + goto err; + used++; + } + pos = list_next_entry(pos, core.node); + } + + for (j = used; j < ANNOTATION_BR_CNTR_SATURATION + 1; j++) { + if (strbuf_addch(&sb, ' ')) + goto err; + } + } + + if (!verbose && strbuf_addch(&sb, br_cntr_nr ? '|' : ' ')) + goto err; + + *str = strbuf_detach(&sb, NULL); + + return 0; +err: + strbuf_release(&sb); + return -ENOMEM; +} + static void __annotation_line__write(struct annotation_line *al, struct annotation *notes, bool first_line, bool current_entry, bool change_color, int width, void *obj, unsigned int percent_type, diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 6b3023f3b18f..4fdaa3d5e8d2 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -553,4 +553,7 @@ int annotate_get_basic_blocks(struct symbol *sym, s64 src, s64 dst, void debuginfo_cache__delete(void); +int annotation_br_cntr_entry(char **str, int br_cntr_nr, u64 *br_cntr, + int num_aggr, struct evsel *evsel); +int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header); #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index 04068d48683f..649392bee7ed 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -40,16 +40,32 @@ static struct block_header_column { [PERF_HPP_REPORT__BLOCK_DSO] = { .name = "Shared Object", .width = 20, + }, + [PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER] = { + .name = "Branch Counter", + .width = 30, } }; -struct block_info *block_info__new(void) +static struct block_info *block_info__new(unsigned int br_cntr_nr) { - return zalloc(sizeof(struct block_info)); + struct block_info *bi = zalloc(sizeof(struct block_info)); + + if (bi && br_cntr_nr) { + bi->br_cntr = calloc(br_cntr_nr, sizeof(u64)); + if (!bi->br_cntr) { + free(bi); + return NULL; + } + } + + return bi; } void block_info__delete(struct block_info *bi) { + if (bi) + free(bi->br_cntr); free(bi); } @@ -86,7 +102,8 @@ int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, static void init_block_info(struct block_info *bi, struct symbol *sym, struct cyc_hist *ch, int offset, - u64 total_cycles) + u64 total_cycles, unsigned int br_cntr_nr, + u64 *br_cntr, struct evsel *evsel) { bi->sym = sym; bi->start = ch->start; @@ -99,10 
+116,18 @@ static void init_block_info(struct block_info *bi, struct symbol *sym, memcpy(bi->cycles_spark, ch->cycles_spark, NUM_SPARKS * sizeof(u64)); + + if (br_cntr && br_cntr_nr) { + bi->br_cntr_nr = br_cntr_nr; + memcpy(bi->br_cntr, &br_cntr[offset * br_cntr_nr], + br_cntr_nr * sizeof(u64)); + } + bi->evsel = evsel; } int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, - u64 *block_cycles_aggr, u64 total_cycles) + u64 *block_cycles_aggr, u64 total_cycles, + unsigned int br_cntr_nr) { struct annotation *notes; struct cyc_hist *ch; @@ -125,12 +150,14 @@ int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, struct block_info *bi; struct hist_entry *he_block; - bi = block_info__new(); + bi = block_info__new(br_cntr_nr); if (!bi) return -1; init_block_info(bi, he->ms.sym, &ch[i], i, - total_cycles); + total_cycles, br_cntr_nr, + notes->branch->br_cntr, + hists_to_evsel(he->hists)); cycles += bi->cycles_aggr / bi->num_aggr; he_block = hists__add_entry_block(&bh->block_hists, @@ -327,6 +354,24 @@ static void init_block_header(struct block_fmt *block_fmt) fmt->width = block_column_width; } +static int block_branch_counter_entry(struct perf_hpp_fmt *fmt, + struct perf_hpp *hpp, + struct hist_entry *he) +{ + struct block_fmt *block_fmt = container_of(fmt, struct block_fmt, fmt); + struct block_info *bi = he->block_info; + char *buf; + int ret; + + if (annotation_br_cntr_entry(&buf, bi->br_cntr_nr, bi->br_cntr, + bi->num_aggr, bi->evsel)) + return 0; + + ret = scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf); + free(buf); + return ret; +} + static void hpp_register(struct block_fmt *block_fmt, int idx, struct perf_hpp_list *hpp_list) { @@ -357,6 +402,9 @@ static void hpp_register(struct block_fmt *block_fmt, int idx, case PERF_HPP_REPORT__BLOCK_DSO: fmt->entry = block_dso_entry; break; + case PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER: + fmt->entry = block_branch_counter_entry; + break; default: return; } @@ -390,7 +438,7 @@ static void init_block_hist(struct block_hist *bh, struct block_fmt *block_fmts, static int process_block_report(struct hists *hists, struct block_report *block_report, u64 total_cycles, int *block_hpps, - int nr_hpps) + int nr_hpps, unsigned int br_cntr_nr) { struct rb_node *next = rb_first_cached(&hists->entries); struct block_hist *bh = &block_report->hist; @@ -405,7 +453,7 @@ static int process_block_report(struct hists *hists, while (next) { he = rb_entry(next, struct hist_entry, rb_node); block_info__process_sym(he, bh, &block_report->cycles, - total_cycles); + total_cycles, br_cntr_nr); next = rb_next(&he->rb_node); } @@ -435,7 +483,7 @@ struct block_report *block_info__create_report(struct evlist *evlist, struct hists *hists = evsel__hists(pos); process_block_report(hists, &block_reports[i], total_cycles, - block_hpps, nr_hpps); + block_hpps, nr_hpps, evlist->nr_br_cntr); i++; } diff --git a/tools/perf/util/block-info.h b/tools/perf/util/block-info.h index 0b9e1aad4c55..b9329dc3ab59 100644 --- a/tools/perf/util/block-info.h +++ b/tools/perf/util/block-info.h @@ -18,6 +18,9 @@ struct block_info { u64 total_cycles; int num; int num_aggr; + int br_cntr_nr; + u64 *br_cntr; + struct evsel *evsel; }; struct block_fmt { @@ -36,6 +39,7 @@ enum { PERF_HPP_REPORT__BLOCK_AVG_CYCLES, PERF_HPP_REPORT__BLOCK_RANGE, PERF_HPP_REPORT__BLOCK_DSO, + PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER, PERF_HPP_REPORT__BLOCK_MAX_INDEX }; @@ -46,7 +50,6 @@ struct block_report { int nr_fmts; }; -struct block_info *block_info__new(void); void 
block_info__delete(struct block_info *bi); int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right); @@ -55,7 +58,8 @@ int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, struct hist_entry *left, struct hist_entry *right); int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, - u64 *block_cycles_aggr, u64 total_cycles); + u64 *block_cycles_aggr, u64 total_cycles, + unsigned int br_cntr_nr); struct block_report *block_info__create_report(struct evlist *evlist, u64 total_cycles, -- cgit v1.2.3 From e6952dcec8302643a8b59751df250d03762429c3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:06 -0700 Subject: perf annotate: Display the branch counter histogram MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Display the branch counter histogram in the annotation view. Press 'B' to display the branch counter's abbreviation list as well. Samples: 1M of events 'anon group { branch-instructions:ppp, branch-misses }', 4000 Hz, Event count (approx.): f3 /home/sdp/test/tchain_edit [Percent: local period] Percent │ IPC Cycle Branch Counter (Average IPC: 1.39, IPC Coverage: 29.4%) │ 0000000000401755 : 0.00 0.00 │ endbr64 │ push %rbp │ mov %rsp,%rbp │ movl $0x0,-0x4(%rbp) 0.00 0.00 │1.33 3 |A |- | ↓ jmp 25 11.03 11.03 │ 11: mov -0x4(%rbp),%eax │ and $0x1,%eax │ test %eax,%eax 17.13 17.13 │2.41 1 |A |- | ↓ je 21 │ addl $0x1,-0x4(%rbp) 21.84 21.84 │2.22 2 |AA |- | ↓ jmp 25 17.13 17.13 │ 21: addl $0x1,-0x4(%rbp) 21.84 21.84 │ 25: cmpl $0x270f,-0x4(%rbp) 11.03 11.03 │0.61 3 |A |- | ↑ jle 11 │ nop │ pop %rbp 0.00 0.00 │0.24 20 |AA |B | ← ret Originally-by: Tinghao Zhang Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-8-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 10 +++++++--- tools/perf/ui/browsers/annotate.c | 18 ++++++++++++++++-- tools/perf/ui/browsers/hists.c | 3 ++- tools/perf/util/annotate.c | 40 ++++++++++++++++++++++++++++++++++++--- tools/perf/util/annotate.h | 11 +++++++++++ tools/perf/util/disasm.c | 1 + 6 files changed, 74 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 9b4a8a379b5b..3dc6197ef3fa 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -927,11 +927,15 @@ int cmd_annotate(int argc, const char **argv) sort_order = "dso,symbol"; /* - * Set SORT_MODE__BRANCH so that annotate display IPC/Cycle - * if branch info is in perf data in TUI mode. + * Set SORT_MODE__BRANCH so that annotate displays IPC/Cycle and + * branch counters, if the corresponding branch info is available + * in the perf data in the TUI mode. 
*/ - if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack) + if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack) { sort__mode = SORT_MODE__BRANCH; + if (annotate.session->evlist->nr_br_cntr > 0) + annotate_opts.show_br_cntr = true; + } if (setup_sorting(NULL) < 0) usage_with_options(annotate_usage, options); diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index fe991a81256b..d7e727345dab 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -156,6 +156,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); u8 pcnt_width = annotation__pcnt_width(notes); + u8 cntr_width = annotation__br_cntr_width(); int width; int diff = 0; @@ -205,13 +206,13 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS); __ui_browser__line_arrow(browser, - pcnt_width + 2 + notes->src->widths.addr + width, + pcnt_width + 2 + notes->src->widths.addr + width + cntr_width, from, to); diff = is_fused(ab, cursor); if (diff > 0) { ui_browser__mark_fused(browser, - pcnt_width + 3 + notes->src->widths.addr + width, + pcnt_width + 3 + notes->src->widths.addr + width + cntr_width, from - diff, diff, to > from); } } @@ -714,6 +715,7 @@ static int annotate_browser__run(struct annotate_browser *browser, struct annotation *notes = symbol__annotation(ms->sym); const char *help = "Press 'h' for help on key bindings"; int delay_secs = hbt ? hbt->refresh : 0; + char *br_cntr_text = NULL; char title[256]; int key; @@ -730,6 +732,8 @@ static int annotate_browser__run(struct annotate_browser *browser, nd = browser->curr_hot; + annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false); + while (1) { key = ui_browser__run(&browser->b, delay_secs); @@ -796,6 +800,7 @@ static int annotate_browser__run(struct annotate_browser *browser, "r Run available scripts\n" "p Toggle percent type [local/global]\n" "b Toggle percent base [period/hits]\n" + "B Branch counter abbr list (Optional)\n" "? Search string backwards\n" "f Toggle showing offsets to full address\n"); continue; @@ -904,6 +909,14 @@ show_sup_ins: hists__scnprintf_title(hists, title, sizeof(title)); annotate_browser__show(&browser->b, title, help); continue; + case 'B': + if (br_cntr_text) + ui_browser__help_window(&browser->b, br_cntr_text); + else { + ui_browser__help_window(&browser->b, + "\n The branch counter is not available.\n"); + } + continue; case 'f': annotation__toggle_full_addr(notes, ms); continue; @@ -923,6 +936,7 @@ show_sup_ins: } out: ui_browser__hide(&browser->b); + free(br_cntr_text); return key; } diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 970f7f349298..49ba82bf3391 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -3705,7 +3705,8 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, memset(&action, 0, sizeof(action)); - annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false); + if (!annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false)) + annotate_opts.show_br_cntr = true; while (1) { key = hist_browser__run(browser, "? 
- help", true, 0); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 5200cafe20e7..4990c70b1794 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -501,8 +501,10 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 } } -static int annotation__compute_ipc(struct annotation *notes, size_t size) +static int annotation__compute_ipc(struct annotation *notes, size_t size, + struct evsel *evsel) { + unsigned int br_cntr_nr = evsel->evlist->nr_br_cntr; int err = 0; s64 offset; @@ -537,6 +539,20 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) al->cycles->max = ch->cycles_max; al->cycles->min = ch->cycles_min; } + if (al && notes->branch->br_cntr) { + if (!al->br_cntr) { + al->br_cntr = calloc(br_cntr_nr, sizeof(u64)); + if (!al->br_cntr) { + err = ENOMEM; + break; + } + } + al->num_aggr = ch->num_aggr; + al->br_cntr_nr = br_cntr_nr; + al->evsel = evsel; + memcpy(al->br_cntr, ¬es->branch->br_cntr[offset * br_cntr_nr], + br_cntr_nr * sizeof(u64)); + } } } @@ -548,8 +564,10 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) struct annotation_line *al; al = annotated_source__get_line(notes->src, offset); - if (al) + if (al) { zfree(&al->cycles); + zfree(&al->br_cntr); + } } } } @@ -1960,6 +1978,22 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati "Cycle(min/max)"); } + if (annotate_opts.show_br_cntr) { + if (show_title) { + obj__printf(obj, "%*s ", + ANNOTATION__BR_CNTR_WIDTH, + "Branch Counter"); + } else { + char *buf; + + if (!annotation_br_cntr_entry(&buf, al->br_cntr_nr, al->br_cntr, + al->num_aggr, al->evsel)) { + obj__printf(obj, "%*s ", ANNOTATION__BR_CNTR_WIDTH, buf); + free(buf); + } + } + } + if (show_title && !*al->line) { ipc_coverage_string(bf, sizeof(bf), notes); obj__printf(obj, "%*s", ANNOTATION__AVG_IPC_WIDTH, bf); @@ -2056,7 +2090,7 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, annotation__set_index(notes); annotation__mark_jump_targets(notes, sym); - err = annotation__compute_ipc(notes, size); + err = annotation__compute_ipc(notes, size, evsel); if (err) return err; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 4fdaa3d5e8d2..8b9e05a1932f 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -31,6 +31,7 @@ struct annotated_data_type; #define ANNOTATION__CYCLES_WIDTH 6 #define ANNOTATION__MINMAX_CYCLES_WIDTH 19 #define ANNOTATION__AVG_IPC_WIDTH 36 +#define ANNOTATION__BR_CNTR_WIDTH 30 #define ANNOTATION_DUMMY_LEN 256 struct annotation_options { @@ -44,6 +45,7 @@ struct annotation_options { show_nr_jumps, show_minmax_cycle, show_asm_raw, + show_br_cntr, annotate_src, full_addr; u8 offset_level; @@ -104,6 +106,10 @@ struct annotation_line { char *fileloc; char *path; struct cycles_info *cycles; + int num_aggr; + int br_cntr_nr; + u64 *br_cntr; + struct evsel *evsel; int jump_sources; u32 idx; int idx_asm; @@ -353,6 +359,11 @@ static inline bool annotation_line__filter(struct annotation_line *al) return annotate_opts.hide_src_code && al->offset == -1; } +static inline u8 annotation__br_cntr_width(void) +{ + return annotate_opts.show_br_cntr ? 
ANNOTATION__BR_CNTR_WIDTH : 0; +} + void annotation__update_column_widths(struct annotation *notes); void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms); diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 226d2181f694..d11e75133b23 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1015,6 +1015,7 @@ static void annotation_line__exit(struct annotation_line *al) zfree_srcline(&al->path); zfree(&al->line); zfree(&al->cycles); + zfree(&al->br_cntr); } static size_t disasm_line_size(int nr) -- cgit v1.2.3 From 6f9d8d1de2c61288edd7dfc2a7e7986c8b2ae927 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:07 -0700 Subject: perf script: Add branch counters It's useful to print the branch counter information for each jump in the brstackinsn when it's available. Add a new field 'brcntr' to display the branch counter information. By default, the abbreviation will be used to indicate the branch counter. In the verbose mode, the real event name is shown. $ perf script -F +brstackinsn,+brcntr # Branch counter abbr list: # branch-instructions:ppp = A # branch-misses = B # '-' No event occurs # '+' Event occurrences may be lost due to branch counter saturated tchain_edit 332203 3366329.405674: 53030 branch-instructions:ppp: 401781 f3+0x2c (home/sdp/test/tchain_edit) f3+31: 0000000000401774 insn: eb 04 br_cntr: AA # PRED 5 cycles [5] 000000000040177a insn: 81 7d fc 0f 27 00 00 0000000000401781 insn: 7e e3 br_cntr: A # PRED 1 cycles [6] 2.00 IPC 0000000000401766 insn: 8b 45 fc 0000000000401769 insn: 83 e0 01 000000000040176c insn: 85 c0 000000000040176e insn: 74 06 br_cntr: A # PRED 1 cycles [7] 4.00 IPC 0000000000401776 insn: 83 45 fc 01 000000000040177a insn: 81 7d fc 0f 27 00 00 0000000000401781 insn: 7e e3 br_cntr: A # PRED 7 cycles [14] 0.43 IPC $ perf script -F +brstackinsn,+brcntr -v tchain_edit 332203 3366329.405674: 53030 branch-instructions:ppp: 401781 f3+0x2c (/home/sdp/os.linux.perf.test-suite/kernels/lbr_kernel/tchain_edit) f3+31: 0000000000401774 insn: eb 04 br_cntr: branch-instructions:ppp 2 branch-misses 0 # PRED 5 cycles [5] 000000000040177a insn: 81 7d fc 0f 27 00 00 0000000000401781 insn: 7e e3 br_cntr: branch-instructions:ppp 1 branch-misses 0 # PRED 1 cycles [6] 2.00 IPC 0000000000401766 insn: 8b 45 fc 0000000000401769 insn: 83 e0 01 000000000040176c insn: 85 c0 000000000040176e insn: 74 06 br_cntr: branch-instructions:ppp 1 branch-misses 0 # PRED 1 cycles [7] 4.00 IPC 0000000000401776 insn: 83 45 fc 01 000000000040177a insn: 81 7d fc 0f 27 00 00 0000000000401781 insn: 7e e3 br_cntr: branch-instructions:ppp 1 branch-misses 0 # PRED 7 cycles [14] 0.43 IPC Originally-by: Tinghao Zhang Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-9-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 2 +- tools/perf/builtin-script.c | 69 ++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 5abb960c4960..b72866ef270b 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -134,7 +134,7 @@ OPTIONS srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackinsnlen, 
brstackdisasm, brstackoff, callindent, insn, disasm, insnlen, synth, phys_addr, metric, misc, srccode, ipc, data_page_size, - code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat, + code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat, brcntr, Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 54598d1e815a..206b08426555 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -62,6 +62,7 @@ #include "util/record.h" #include "util/util.h" #include "util/cgroup.h" +#include "util/annotate.h" #include "perf.h" #include @@ -138,6 +139,7 @@ enum perf_output_field { PERF_OUTPUT_DSOFF = 1ULL << 41, PERF_OUTPUT_DISASM = 1ULL << 42, PERF_OUTPUT_BRSTACKDISASM = 1ULL << 43, + PERF_OUTPUT_BRCNTR = 1ULL << 44, }; struct perf_script { @@ -213,6 +215,7 @@ struct output_option { {.str = "cgroup", .field = PERF_OUTPUT_CGROUP}, {.str = "retire_lat", .field = PERF_OUTPUT_RETIRE_LAT}, {.str = "brstackdisasm", .field = PERF_OUTPUT_BRSTACKDISASM}, + {.str = "brcntr", .field = PERF_OUTPUT_BRCNTR}, }; enum { @@ -520,6 +523,12 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) "Hint: run 'perf record -b ...'\n"); return -EINVAL; } + if (PRINT_FIELD(BRCNTR) && + !(evlist__combined_branch_type(session->evlist) & PERF_SAMPLE_BRANCH_COUNTERS)) { + pr_err("Display of branch counter requested but it's not enabled\n" + "Hint: run 'perf record -j any,counter ...'\n"); + return -EINVAL; + } if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) && evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID", PERF_OUTPUT_TID|PERF_OUTPUT_PID)) return -EINVAL; @@ -789,6 +798,19 @@ static int perf_sample__fprintf_start(struct perf_script *script, int printed = 0; char tstr[128]; + /* + * Print the branch counter's abbreviation list, + * if the branch counter is available. + */ + if (PRINT_FIELD(BRCNTR) && !verbose) { + char *buf; + + if (!annotation_br_cntr_abbr_list(&buf, evsel, true)) { + printed += fprintf(stdout, "%s", buf); + free(buf); + } + } + if (PRINT_FIELD(MACHINE_PID) && sample->machine_pid) printed += fprintf(fp, "VM:%5d ", sample->machine_pid); @@ -1195,7 +1217,9 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, struct perf_insn *x, u8 *inbuf, int len, int insn, FILE *fp, int *total_cycles, struct perf_event_attr *attr, - struct thread *thread) + struct thread *thread, + struct evsel *evsel, + u64 br_cntr) { int ilen = 0; int printed = fprintf(fp, "\t%016" PRIx64 "\t", ip); @@ -1216,6 +1240,28 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, addr_location__exit(&al); } + if (PRINT_FIELD(BRCNTR)) { + unsigned int width = evsel__env(evsel)->br_cntr_width; + unsigned int i = 0, j, num, mask = (1L << width) - 1; + struct evsel *pos = evsel__leader(evsel); + + printed += fprintf(fp, "br_cntr: "); + evlist__for_each_entry_from(evsel->evlist, pos) { + if (!(pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) + continue; + if (evsel__leader(pos) != evsel__leader(evsel)) + break; + + num = (br_cntr >> (i++ * width)) & mask; + if (!verbose) { + for (j = 0; j < num; j++) + printed += fprintf(fp, "%s", pos->abbr_name); + } else + printed += fprintf(fp, "%s %d ", pos->name, num); + } + printed += fprintf(fp, "\t"); + } + printed += fprintf(fp, "#%s%s%s%s", en->flags.predicted ? " PRED" : "", en->flags.mispred ? 
" MISPRED" : "", @@ -1272,6 +1318,7 @@ out: } static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, + struct evsel *evsel, struct thread *thread, struct perf_event_attr *attr, struct machine *machine, FILE *fp) @@ -1285,6 +1332,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, unsigned off; struct symbol *lastsym = NULL; int total_cycles = 0; + u64 br_cntr = 0; if (!(br && br->nr)) return 0; @@ -1296,6 +1344,9 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, x.machine = machine; x.cpu = sample->cpu; + if (PRINT_FIELD(BRCNTR) && sample->branch_stack_cntr) + br_cntr = sample->branch_stack_cntr[nr - 1]; + printed += fprintf(fp, "%c", '\n'); /* Handle first from jump, of which we don't know the entry. */ @@ -1307,7 +1358,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, x.cpumode, x.cpu, &lastsym, attr, fp); printed += ip__fprintf_jump(entries[nr - 1].from, &entries[nr - 1], &x, buffer, len, 0, fp, &total_cycles, - attr, thread); + attr, thread, evsel, br_cntr); if (PRINT_FIELD(SRCCODE)) printed += print_srccode(thread, x.cpumode, entries[nr - 1].from); } @@ -1337,8 +1388,10 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (ip == end) { + if (PRINT_FIELD(BRCNTR) && sample->branch_stack_cntr) + br_cntr = sample->branch_stack_cntr[i]; printed += ip__fprintf_jump(ip, &entries[i], &x, buffer + off, len - off, ++insn, fp, - &total_cycles, attr, thread); + &total_cycles, attr, thread, evsel, br_cntr); if (PRINT_FIELD(SRCCODE)) printed += print_srccode(thread, x.cpumode, ip); break; @@ -1547,6 +1600,7 @@ void script_fetch_insn(struct perf_sample *sample, struct thread *thread, } static int perf_sample__fprintf_insn(struct perf_sample *sample, + struct evsel *evsel, struct perf_event_attr *attr, struct thread *thread, struct machine *machine, FILE *fp, @@ -1567,7 +1621,7 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample, printed += sample__fprintf_insn_asm(sample, thread, machine, fp, al); } if (PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN) || PRINT_FIELD(BRSTACKDISASM)) - printed += perf_sample__fprintf_brstackinsn(sample, thread, attr, machine, fp); + printed += perf_sample__fprintf_brstackinsn(sample, evsel, thread, attr, machine, fp); return printed; } @@ -1639,7 +1693,7 @@ static int perf_sample__fprintf_bts(struct perf_sample *sample, if (print_srcline_last) printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); - printed += perf_sample__fprintf_insn(sample, attr, thread, machine, fp, al); + printed += perf_sample__fprintf_insn(sample, evsel, attr, thread, machine, fp, al); printed += fprintf(fp, "\n"); if (PRINT_FIELD(SRCCODE)) { int ret = map__fprintf_srccode(al->map, al->addr, stdout, @@ -2297,7 +2351,7 @@ static void process_event(struct perf_script *script, if (evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) perf_sample__fprintf_bpf_output(sample, fp); - perf_sample__fprintf_insn(sample, attr, thread, machine, fp, al); + perf_sample__fprintf_insn(sample, evsel, attr, thread, machine, fp, al); if (PRINT_FIELD(PHYS_ADDR)) fprintf(fp, "%16" PRIx64, sample->phys_addr); @@ -3947,7 +4001,8 @@ int cmd_script(int argc, const char **argv) "brstacksym,flags,data_src,weight,bpf-output,brstackinsn," "brstackinsnlen,brstackdisasm,brstackoff,callindent,insn,disasm,insnlen,synth," "phys_addr,metric,misc,srccode,ipc,tod,data_page_size," - 
"code_page_size,ins_lat,machine_pid,vcpu,cgroup,retire_lat", + "code_page_size,ins_lat,machine_pid,vcpu,cgroup,retire_lat," + "brcntr", parse_output_fields), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), -- cgit v1.2.3 From dab5b6cb0d405425502e60450ecf1c8e5ea6b845 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:08 -0700 Subject: perf test: Add new test cases for the branch counter feature Enhance the test case for the branch counter feature. Now, the test verifies: - The new filter can be successfully applied on the supported platforms. - The counter value can be outputted via the perf report -D - The counter value and the abbr name can be outputted via the perf script (New) Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-10-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index 3d1a7759a7b2..7964ebd9007d 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -21,6 +21,7 @@ testprog="perf test -w thloop" cpu_pmu_dir="/sys/bus/event_source/devices/cpu*" br_cntr_file="/caps/branch_counter_nr" br_cntr_output="branch stack counters" +br_cntr_script_output="br_cntr: A" cleanup() { rm -rf "${perfdata}" @@ -165,7 +166,7 @@ test_workload() { } test_branch_counter() { - echo "Basic branch counter test" + echo "Branch counter test" # Check if the branch counter feature is supported for dir in $cpu_pmu_dir do @@ -175,19 +176,25 @@ test_branch_counter() { return fi done - if ! perf record -o "${perfdata}" -j any,counter ${testprog} 2> /dev/null + if ! perf record -o "${perfdata}" -e "{branches:p,instructions}" -j any,counter ${testprog} 2> /dev/null then - echo "Basic branch counter test [Failed record]" + echo "Branch counter record test [Failed record]" err=1 return fi if ! perf report -i "${perfdata}" -D -q | grep -q "$br_cntr_output" then - echo "Basic branch record test [Failed missing output]" + echo "Branch counter report test [Failed missing output]" err=1 return fi - echo "Basic branch counter test [Success]" + if ! perf script -i "${perfdata}" -F +brstackinsn,+brcntr | grep -q "$br_cntr_script_output" + then + echo " Branch counter script test [Failed missing output]" + err=1 + return + fi + echo "Branch counter test [Success]" } test_per_thread -- cgit v1.2.3 From 4c55560f23d19051adc7e76818687a88448bef83 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 14 Aug 2024 10:36:24 -0300 Subject: perf build: Fix up broken capstone feature detection fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The capstone devel headers define 'struct bpf_insn' in a way that clashes with what is in the libbpf devel headers, so we so far need to avoid including both. 
This is happening on the tools/build/feature/test-all.c file, where we try building against all of the libraries expected to be normally available on a system:

⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.make.output
In file included from test-bpf.c:3,
                 from test-all.c:150:
/home/acme/git/perf-tools-next/tools/include/uapi/linux/bpf.h:77:8: error: ‘bpf_insn’ defined as wrong kind of tag
   77 | struct bpf_insn {
      |        ^~~~~~~~
⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.make.output

When doing so there is a trick where we define main to be main_test_libcapstone, then include the individual tools/build/feature/test-libcapstone.c capability query test, and then we undef 'main' because we'll do it all over again with the next expected library to be tested (at this time 'lzma'). To complete this mechanism we need to call main_test_libcapstone() in the test-all.c 'main' routine, which isn't being done, so the effect of adding references to capstone in test-all.c is not achieved. The only thing that is happening is that test-all.c is failing to build and thus all the tests will have to be done individually, which nullifies the test-all.c single build speedup. So let's remove references to capstone from test-all.c to see if this makes it build again so that we get faster builds, or go on fixing up whatever is preventing us from getting that benefit. Nothing was preventing us: after this fix we get a clean test-all.c build and get the build speedup back:

⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.make.output
⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.
test-all.bin test-all.d test-all.make.output
⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.make.output
⬢[acme@toolbox perf-tools-next]$ ldd /tmp/build/perf-tools-next/feature/test-all.bin
	linux-vdso.so.1 (0x00007f13277a1000)
	libpython3.12.so.1.0 => /lib64/libpython3.12.so.1.0 (0x00007f1326e00000)
	libm.so.6 => /lib64/libm.so.6 (0x00007f13274be000)
	libtraceevent.so.1 => /lib64/libtraceevent.so.1 (0x00007f1327496000)
	libtracefs.so.1 => /lib64/libtracefs.so.1 (0x00007f132746f000)
	libcrypto.so.3 => /lib64/libcrypto.so.3 (0x00007f1326800000)
	libunwind-x86_64.so.8 => /lib64/libunwind-x86_64.so.8 (0x00007f1327452000)
	libunwind.so.8 => /lib64/libunwind.so.8 (0x00007f1327436000)
	liblzma.so.5 => /lib64/liblzma.so.5 (0x00007f1327403000)
	libdw.so.1 => /lib64/libdw.so.1 (0x00007f1326d6f000)
	libz.so.1 => /lib64/libz.so.1 (0x00007f13273e2000)
	libelf.so.1 => /lib64/libelf.so.1 (0x00007f1326d53000)
	libnuma.so.1 => /lib64/libnuma.so.1 (0x00007f13273d4000)
	libslang.so.2 => /lib64/libslang.so.2 (0x00007f1326400000)
	libperl.so.5.38 => /lib64/libperl.so.5.38 (0x00007f1326000000)
	libc.so.6 => /lib64/libc.so.6 (0x00007f1325e0f000)
	libzstd.so.1 => /lib64/libzstd.so.1 (0x00007f1326741000)
	/lib64/ld-linux-x86-64.so.2 (0x00007f13277a3000)
	libbz2.so.1 => /lib64/libbz2.so.1 (0x00007f1326d3f000)
	libcrypt.so.2 => /lib64/libcrypt.so.2 (0x00007f1326d07000)
⬢[acme@toolbox perf-tools-next]$

And with capstone-devel installed we get it detected and linked into perf, allowing us to benefit from the features that it enables:

⬢[acme@toolbox perf-tools-next]$ rpm -q capstone-devel
capstone-devel-5.0.1-3.fc40.x86_64
⬢[acme@toolbox perf-tools-next]$ ldd /tmp/build/perf-tools-next/perf | grep capstone
	libcapstone.so.5 => /lib64/libcapstone.so.5 (0x00007fe6a5c00000)
⬢[acme@toolbox perf-tools-next]$ /tmp/build/perf-tools-next/perf -vv | grep cap
libcapstone: [ on ] # HAVE_LIBCAPSTONE_SUPPORT
⬢[acme@toolbox perf-tools-next]$

Fixes: 8b767db3309595a2 ("perf: build: introduce the libcapstone")
Cc: Adrian Hunter
Cc: Andi Kleen
Cc: Changbin Du
Cc: Ian Rogers
Cc: Jiri Olsa
Cc: Kan Liang
Cc: Namhyung Kim
Link: https://lore.kernel.org/lkml/Zry0sepD5Ppa5YKP@x1
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/build/feature/test-all.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index dd0a18c2ef8f..6f4bf386a3b5 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -134,10 +134,6 @@
 #undef main
 #endif
 
-#define main main_test_libcapstone
-# include "test-libcapstone.c"
-#undef main
-
 #define main main_test_lzma
 # include "test-lzma.c"
 #undef main
--
cgit v1.2.3

From 27ac597c0e2f596f53963db4a2cf53cd84657cda Mon Sep 17 00:00:00 2001
From: Veronika Molnarova
Date: Mon, 29 Apr 2024 10:53:07 +0200
Subject: perf test record.sh: Raise limit of open file descriptors

The subtest for system-wide record with the '--threads=cpu' option fails due to the limit on open file descriptors on systems with 128 or more CPUs, as the default limit is set to 1024. The number of open file descriptors needed is slightly above nmb_events*nmb_cpus + nmb_cpus (for perf.data.n) + 4*nmb_cpus (for pipes), which equals 8*nmb_cpus (nmb_events is 3: the two test events plus a dummy event). Therefore, temporarily raise the limit to 16*nmb_cpus for the test.

Committer notes:

Instead of disabling the ShellCheck warnings on all the uses of 'ulimit -Sn', i.e. those:

In tests/shell/record.sh line 35:
default_fd_limit=$(ulimit -Sn)
                   ^-^ SC3045 (warning): In POSIX sh, ulimit -S is undefined.

we can just switch from using '/bin/sh' to '/bin/bash' for this test, as bash _has_ 'ulimit -Sn', so ShellCheck will not emit that warning. There are dozens of 'perf test' shell tests that do just that; '/bin/bash' is a reasonable expectation for those tests.
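For reference, the raise-then-restore logic the test performs, expressed as a C sketch (illustrative only, assuming sysconf() and setrlimit(); the shell test itself uses 'getconf _NPROCESSORS_ONLN' and 'ulimit -Sn', as the diff below shows):

  #include <sys/resource.h>
  #include <unistd.h>

  /* Raise the soft fd limit to 16 * nr_online_cpus, e.g. 2048 on a
   * 128-CPU machine, where the required 8 * 128 descriptors would
   * already exhaust the usual default of 1024.  The caller restores
   * *saved once the test is done.
   */
  int raise_fd_limit(struct rlimit *saved)
  {
          long nproc = sysconf(_SC_NPROCESSORS_ONLN);
          struct rlimit rl;

          if (nproc < 0 || getrlimit(RLIMIT_NOFILE, saved))
                  return -1;
          rl = *saved;
          if ((rlim_t)nproc * 16 > rl.rlim_cur) {
                  rl.rlim_cur = (rlim_t)nproc * 16;
                  return setrlimit(RLIMIT_NOFILE, &rl);
          }
          return 0;
  }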
Signed-off-by: Veronika Molnarova Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Michael Petlan Cc: Radostin Stoyanov Link: https://lore.kernel.org/linux-perf-users/20240429085721.10122-1-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record.sh | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index 7964ebd9007d..36883b03169f 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # perf record tests # SPDX-License-Identifier: GPL-2.0 @@ -23,6 +23,15 @@ br_cntr_file="/caps/branch_counter_nr" br_cntr_output="branch stack counters" br_cntr_script_output="br_cntr: A" +default_fd_limit=$(ulimit -Sn) +# With option --threads=cpu the number of open file descriptors should be +# equal to sum of: nmb_cpus * nmb_events (2+dummy), +# nmb_threads for perf.data.n (equal to nmb_cpus) and +# 2*nmb_cpus of pipes = 4*nmb_cpus (each pipe has 2 ends) +# All together it needs 8*nmb_cpus file descriptors plus some are also used +# outside of testing, thus raising the limit to 16*nmb_cpus +min_fd_limit=$(($(getconf _NPROCESSORS_ONLN) * 16)) + cleanup() { rm -rf "${perfdata}" rm -rf "${perfdata}".old @@ -197,11 +206,19 @@ test_branch_counter() { echo "Branch counter test [Success]" } +# raise the limit of file descriptors to minimum +if [[ $default_fd_limit -lt $min_fd_limit ]]; then + ulimit -Sn $min_fd_limit +fi + test_per_thread test_register_capture test_system_wide test_workload test_branch_counter +# restore the default value +ulimit -Sn $default_fd_limit + cleanup exit $err -- cgit v1.2.3 From a4ae5c31e0f28d2c737583a06d4b29ffd6bbd622 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 13 Aug 2024 14:45:05 +0200 Subject: selftests/bpf: convert get_current_cgroup_id_user to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_current_cgroup_id_user allows testing for bpf_get_current_cgroup_id() bpf API but is not integrated into test_progs, and so is not tested automatically in CI. Convert it to the test_progs framework to allow running it automatically. 
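Schematically, a conversion of this kind looks as follows (all names here are illustrative placeholders, not the actual ones; the real test added by this patch is in the diff below):

  /* Sketch of the test_progs pattern: open and load the bpftool-generated
   * skeleton, let libbpf autoattach the tracepoint program, trigger the
   * traced syscall, then read the result from .bss globals instead of
   * doing bpf_map_lookup_elem() round trips.
   */
  void test_example(void)
  {
          struct example_kern *skel;      /* hypothetical skeleton */

          skel = example_kern__open_and_load();
          if (!ASSERT_OK_PTR(skel, "open and load"))
                  return;
          if (!ASSERT_OK(example_kern__attach(skel), "attach"))
                  goto cleanup;

          skel->bss->expected_pid = getpid();
          usleep(1);                      /* glibc implements this via nanosleep */

          ASSERT_NEQ(skel->bss->cg_id, 0, "cg_id set by bpf prog");
  cleanup:
          example_kern__destroy(skel);
  }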
The most notable differences with the old test are the following: - the new test relies on autoattach instead of manually hooking/enabling the targeted tracepoint through perf_event, which reduces quite a lot the test code size - it also accesses bpf prog data through global variables instead of maps - sleep duration passed to nanosleep syscall has been reduced to its minimum to not impact overall CI duration (we only care about the syscall being properly triggered, not about the passed duration) Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-1-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/get_cgroup_id_user.c | 151 --------------------- .../bpf/prog_tests/cgroup_get_current_cgroup_id.c | 46 +++++++ .../selftests/bpf/progs/get_cgroup_id_kern.c | 26 +--- 5 files changed, 51 insertions(+), 176 deletions(-) delete mode 100644 tools/testing/selftests/bpf/get_cgroup_id_user.c create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 8f14d8faeb0b..7d4d5d3e0210 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -19,7 +19,6 @@ test_sock urandom_read test_sockmap test_lirc_mode2_user -get_cgroup_id_user test_skb_cgroup_id_user test_cgroup_storage test_flow_dissector diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f54185e96a95..8beade034caf 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -67,7 +67,7 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_sock test_sockmap get_cgroup_id_user \ + test_sock test_sockmap \ test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 @@ -295,7 +295,6 @@ $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) -$(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_cgroup_storage: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c deleted file mode 100644 index aefd83ebdcd7..000000000000 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -#define CHECK(condition, tag, format...) 
({ \ - int __ret = !!(condition); \ - if (__ret) { \ - printf("%s:FAIL:%s ", __func__, tag); \ - printf(format); \ - } else { \ - printf("%s:PASS:%s\n", __func__, tag); \ - } \ - __ret; \ -}) - -static int bpf_find_map(const char *test, struct bpf_object *obj, - const char *name) -{ - struct bpf_map *map; - - map = bpf_object__find_map_by_name(obj, name); - if (!map) - return -1; - return bpf_map__fd(map); -} - -#define TEST_CGROUP "/test-bpf-get-cgroup-id/" - -int main(int argc, char **argv) -{ - const char *probe_name = "syscalls/sys_enter_nanosleep"; - const char *file = "get_cgroup_id_kern.bpf.o"; - int err, bytes, efd, prog_fd, pmu_fd; - int cgroup_fd, cgidmap_fd, pidmap_fd; - struct perf_event_attr attr = {}; - struct bpf_object *obj; - __u64 kcgid = 0, ucgid; - __u32 key = 0, pid; - int exit_code = 1; - char buf[256]; - const struct timespec req = { - .tv_sec = 1, - .tv_nsec = 0, - }; - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - if (CHECK(cgroup_fd < 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno)) - return 1; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - if (CHECK(err, "bpf_prog_test_load", "err %d errno %d\n", err, errno)) - goto cleanup_cgroup_env; - - cgidmap_fd = bpf_find_map(__func__, obj, "cg_ids"); - if (CHECK(cgidmap_fd < 0, "bpf_find_map", "err %d errno %d\n", - cgidmap_fd, errno)) - goto close_prog; - - pidmap_fd = bpf_find_map(__func__, obj, "pidmap"); - if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n", - pidmap_fd, errno)) - goto close_prog; - - pid = getpid(); - bpf_map_update_elem(pidmap_fd, &key, &pid, 0); - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) { - snprintf(buf, sizeof(buf), - "/sys/kernel/tracing/events/%s/id", probe_name); - } else { - snprintf(buf, sizeof(buf), - "/sys/kernel/debug/tracing/events/%s/id", probe_name); - } - efd = open(buf, O_RDONLY, 0); - if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) - goto close_prog; - bytes = read(efd, buf, sizeof(buf)); - close(efd); - if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read", - "bytes %d errno %d\n", bytes, errno)) - goto close_prog; - - attr.config = strtol(buf, NULL, 0); - attr.type = PERF_TYPE_TRACEPOINT; - attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; - - /* attach to this pid so the all bpf invocations will be in the - * cgroup associated with this pid. 
- */ - pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0); - if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd, - errno)) - goto close_prog; - - err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); - if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err, - errno)) - goto close_pmu; - - err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); - if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err, - errno)) - goto close_pmu; - - /* trigger some syscalls */ - syscall(__NR_nanosleep, &req, NULL); - - err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid); - if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno)) - goto close_pmu; - - ucgid = get_cgroup_id(TEST_CGROUP); - if (CHECK(kcgid != ucgid, "compare_cgroup_id", - "kern cgid %llx user cgid %llx", kcgid, ucgid)) - goto close_pmu; - - exit_code = 0; - printf("%s:PASS\n", argv[0]); - -close_pmu: - close(pmu_fd); -close_prog: - bpf_object__close(obj); -cleanup_cgroup_env: - cleanup_cgroup_environment(); - return exit_code; -} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c b/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c new file mode 100644 index 000000000000..7a1643b03bf3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "get_cgroup_id_kern.skel.h" + +#define TEST_CGROUP "/test-bpf-get-cgroup-id/" + +void test_cgroup_get_current_cgroup_id(void) +{ + struct get_cgroup_id_kern *skel; + const struct timespec req = { + .tv_sec = 0, + .tv_nsec = 1, + }; + int cgroup_fd; + __u64 ucgid; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup switch")) + return; + + skel = get_cgroup_id_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_cgroup; + + if (!ASSERT_OK(get_cgroup_id_kern__attach(skel), "attach bpf program")) + goto cleanup_progs; + + skel->bss->expected_pid = getpid(); + /* trigger the syscall on which is attached the tested prog */ + if (!ASSERT_OK(syscall(__NR_nanosleep, &req, NULL), "nanosleep")) + goto cleanup_progs; + + ucgid = get_cgroup_id(TEST_CGROUP); + + ASSERT_EQ(skel->bss->cg_id, ucgid, "compare cgroup ids"); + +cleanup_progs: + get_cgroup_id_kern__destroy(skel); +cleanup_cgroup: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c index 68587b1de34e..30fd504856c7 100644 --- a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c +++ b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c @@ -4,34 +4,16 @@ #include #include -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __type(value, __u64); -} cg_ids SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __type(value, __u32); -} pidmap SEC(".maps"); +__u64 cg_id; +__u64 expected_pid; SEC("tracepoint/syscalls/sys_enter_nanosleep") int trace(void *ctx) { __u32 pid = bpf_get_current_pid_tgid(); - __u32 key = 0, *expected_pid; - __u64 *val; - - expected_pid = bpf_map_lookup_elem(&pidmap, &key); - if (!expected_pid || *expected_pid != pid) - return 0; - val = bpf_map_lookup_elem(&cg_ids, &key); - if (val) - *val = bpf_get_current_cgroup_id(); + if (expected_pid == pid) + cg_id = 
bpf_get_current_cgroup_id(); return 0; } -- cgit v1.2.3 From 37a14cfd667a22662867b986925f1ad4205c17df Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 13 Aug 2024 14:45:06 +0200 Subject: selftests/bpf: convert test_cgroup_storage to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_cgroup_storage is currently a standalone program which is not run when executing test_progs. Convert it to the test_progs framework so it can be automatically executed in CI. The conversion led to the following changes: - converted the raw bpf program in the userspace test file into a dedicated test program in progs/ dir - reduced the scope of cgroup_storage test: the content from this test overlaps with some other tests already present in test_progs, most notably netcnt and cgroup_storage_multi*. Those tests already check extensively local storage, per-cpu local storage, cgroups interaction, etc. So the new test only keep the part testing that the program return code (based on map content) properly leads to packet being passed or dropped. Reviewed-by: Alan Maguire Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-2-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 - .../selftests/bpf/prog_tests/cgroup_storage.c | 96 ++++++++++++ tools/testing/selftests/bpf/progs/cgroup_storage.c | 24 +++ tools/testing/selftests/bpf/test_cgroup_storage.c | 174 --------------------- 5 files changed, 120 insertions(+), 177 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_storage.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_storage.c delete mode 100644 tools/testing/selftests/bpf/test_cgroup_storage.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 7d4d5d3e0210..fd7ae37024e2 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -20,7 +20,6 @@ urandom_read test_sockmap test_lirc_mode2_user test_skb_cgroup_id_user -test_cgroup_storage test_flow_dissector flow_dissector_load test_tcpnotify_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8beade034caf..99f8d6329694 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,7 +68,6 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_sock test_sockmap \ - test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 TEST_INST_SUBDIRS := no_alu32 @@ -295,7 +294,6 @@ $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) -$(OUTPUT)/test_cgroup_storage: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tag: $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c b/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c new file mode 100644 index 000000000000..cf395715ced4 --- /dev/null +++ 
b/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_helpers.h" +#include "network_helpers.h" +#include "cgroup_storage.skel.h" + +#define TEST_CGROUP "/test-bpf-cgroup-storage-buf/" +#define TEST_NS "cgroup_storage_ns" +#define PING_CMD "ping localhost -c 1 -W 1 -q" + +static int setup_network(struct nstoken **token) +{ + SYS(fail, "ip netns add %s", TEST_NS); + *token = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(*token, "open netns")) + goto cleanup_ns; + SYS(cleanup_ns, "ip link set lo up"); + + return 0; + +cleanup_ns: + SYS_NOFAIL("ip netns del %s", TEST_NS); +fail: + return -1; +} + +static void cleanup_network(struct nstoken *ns) +{ + close_netns(ns); + SYS_NOFAIL("ip netns del %s", TEST_NS); +} + +void test_cgroup_storage(void) +{ + struct bpf_cgroup_storage_key key; + struct cgroup_storage *skel; + struct nstoken *ns = NULL; + unsigned long long value; + int cgroup_fd; + int err; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "create cgroup")) + return; + + if (!ASSERT_OK(setup_network(&ns), "setup network")) + goto cleanup_cgroup; + + skel = cgroup_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_network; + + skel->links.bpf_prog = + bpf_program__attach_cgroup(skel->progs.bpf_prog, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_prog, "attach program")) + goto cleanup_progs; + + /* Check that one out of every two packets is dropped */ + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "first ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_NEQ(err, 0, "second ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "third ping"); + + err = bpf_map__get_next_key(skel->maps.cgroup_storage, NULL, &key, + sizeof(key)); + if (!ASSERT_OK(err, "get first key")) + goto cleanup_progs; + err = bpf_map__lookup_elem(skel->maps.cgroup_storage, &key, sizeof(key), + &value, sizeof(value), 0); + if (!ASSERT_OK(err, "first packet count read")) + goto cleanup_progs; + + /* Add one to the packet counter, check again packet filtering */ + value++; + err = bpf_map__update_elem(skel->maps.cgroup_storage, &key, sizeof(key), + &value, sizeof(value), 0); + if (!ASSERT_OK(err, "increment packet counter")) + goto cleanup_progs; + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "fourth ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_NEQ(err, 0, "fifth ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "sixth ping"); + +cleanup_progs: + cgroup_storage__destroy(skel); +cleanup_network: + cleanup_network(ns); +cleanup_cgroup: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_storage.c b/tools/testing/selftests/bpf/progs/cgroup_storage.c new file mode 100644 index 000000000000..db1e4d2d3281 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_storage.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, __u64); +} cgroup_storage SEC(".maps"); + +SEC("cgroup_skb/egress") +int bpf_prog(struct __sk_buff *skb) +{ + __u64 *counter; + + counter = bpf_get_local_storage(&cgroup_storage, 0); + __sync_fetch_and_add(counter, 1); + + /* Drop one out of every two packets */ + return (*counter & 1); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c deleted 
file mode 100644 index 0861ea60dcdd..000000000000 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#include "bpf_util.h" -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -#define TEST_CGROUP "/test-bpf-cgroup-storage-buf/" - -int main(int argc, char **argv) -{ - struct bpf_insn prog[] = { - BPF_LD_MAP_FD(BPF_REG_1, 0), /* percpu map fd */ - BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - BPF_FUNC_get_local_storage), - BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), - BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), - - BPF_LD_MAP_FD(BPF_REG_1, 0), /* map fd */ - BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - BPF_FUNC_get_local_storage), - BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), - BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), - BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1), - BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), - BPF_EXIT_INSN(), - }; - size_t insns_cnt = ARRAY_SIZE(prog); - int error = EXIT_FAILURE; - int map_fd, percpu_map_fd, prog_fd, cgroup_fd; - struct bpf_cgroup_storage_key key; - unsigned long long value; - unsigned long long *percpu_value; - int cpu, nproc; - - nproc = bpf_num_possible_cpus(); - percpu_value = malloc(sizeof(*percpu_value) * nproc); - if (!percpu_value) { - printf("Not enough memory for per-cpu area (%d cpus)\n", nproc); - goto err; - } - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - map_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_STORAGE, NULL, sizeof(key), - sizeof(value), 0, NULL); - if (map_fd < 0) { - printf("Failed to create map: %s\n", strerror(errno)); - goto out; - } - - percpu_map_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, NULL, - sizeof(key), sizeof(value), 0, NULL); - if (percpu_map_fd < 0) { - printf("Failed to create map: %s\n", strerror(errno)); - goto out; - } - - prog[0].imm = percpu_map_fd; - prog[7].imm = map_fd; - prog_fd = bpf_test_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - if (prog_fd < 0) { - printf("Failed to load bpf program: %s\n", bpf_log_buf); - goto out; - } - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - - /* Attach the bpf program */ - if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) { - printf("Failed to attach bpf program\n"); - goto err; - } - - if (bpf_map_get_next_key(map_fd, NULL, &key)) { - printf("Failed to get the first key in cgroup storage\n"); - goto err; - } - - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup cgroup storage 0\n"); - goto err; - } - - for (cpu = 0; cpu < nproc; cpu++) - percpu_value[cpu] = 1000; - - if (bpf_map_update_elem(percpu_map_fd, &key, percpu_value, 0)) { - printf("Failed to update the data in the cgroup storage\n"); - goto err; - } - - /* Every second packet should be dropped */ - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null")); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - - /* Check the counter in the cgroup local storage */ - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup cgroup storage\n"); - goto err; - } - - if (value != 3) { - printf("Unexpected data 
in the cgroup storage: %llu\n", value); - goto err; - } - - /* Bump the counter in the cgroup local storage */ - value++; - if (bpf_map_update_elem(map_fd, &key, &value, 0)) { - printf("Failed to update the data in the cgroup storage\n"); - goto err; - } - - /* Every second packet should be dropped */ - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null")); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - - /* Check the final value of the counter in the cgroup local storage */ - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup the cgroup storage\n"); - goto err; - } - - if (value != 7) { - printf("Unexpected data in the cgroup storage: %llu\n", value); - goto err; - } - - /* Check the final value of the counter in the percpu local storage */ - - for (cpu = 0; cpu < nproc; cpu++) - percpu_value[cpu] = 0; - - if (bpf_map_lookup_elem(percpu_map_fd, &key, percpu_value)) { - printf("Failed to lookup the per-cpu cgroup storage\n"); - goto err; - } - - value = 0; - for (cpu = 0; cpu < nproc; cpu++) - value += percpu_value[cpu]; - - if (value != nproc * 1000 + 6) { - printf("Unexpected data in the per-cpu cgroup storage\n"); - goto err; - } - - error = 0; - printf("test_cgroup_storage:PASS\n"); - -err: - cleanup_cgroup_environment(); - free(percpu_value); - -out: - return error; -} -- cgit v1.2.3 From 7b4400a0a69b8255a2afed0bead0ce2c2c457545 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 13 Aug 2024 14:45:07 +0200 Subject: selftests/bpf: add proper section name to bpf prog and rename it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_skb_cgroup_id_kern.c is currently involved in a manual test. In its current form, it can not be used with the auto-generated skeleton APIs, because the section name is not valid to allow libbpf to deduce the program type. Update section name to allow skeleton APIs usage. Also rename the program name to make it shorter and more straighforward regarding the API it is testing. 
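The constraint at play is that libbpf deduces the program type from the ELF section name set by SEC(), so only recognized section prefixes can be used with the skeleton APIs. Schematically:

  /* Before: SEC("cgroup_id_logger") is an arbitrary string, so libbpf
   * cannot map it to a program type and the skeleton APIs cannot be
   * used.  After: "tc" is a recognized prefix that libbpf maps to
   * BPF_PROG_TYPE_SCHED_CLS.
   */
  SEC("tc")
  int log_cgroup_id(struct __sk_buff *skb)
  {
          /* body elided; see the full program in the diff below */
          return TC_ACT_OK;
  }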
While doing so, make sure that test_skb_cgroup_id.sh passes to get a working reference before converting it to test_progs - update the obj name - fix loading issue (verifier rejecting the program when loaded through tc, because of map not found), by preloading the whole obj with bpftool Reviewed-by: Alan Maguire Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-3-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/progs/cgroup_ancestor.c | 45 ++++++++++++++++++++++ .../selftests/bpf/progs/test_skb_cgroup_id_kern.c | 45 ---------------------- tools/testing/selftests/bpf/test_skb_cgroup_id.sh | 12 ++++-- 3 files changed, 53 insertions(+), 49 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/cgroup_ancestor.c delete mode 100644 tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/cgroup_ancestor.c b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c new file mode 100644 index 000000000000..4879645f5827 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include +#include + +#include + +#include + +#define NUM_CGROUP_LEVELS 4 + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, NUM_CGROUP_LEVELS); +} cgroup_ids SEC(".maps"); + +static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) +{ + __u64 id; + + /* [1] &level passed to external function that may change it, it's + * incompatible with loop unroll. + */ + id = bpf_skb_ancestor_cgroup_id(skb, level); + bpf_map_update_elem(&cgroup_ids, &level, &id, 0); +} + +SEC("tc") +int log_cgroup_id(struct __sk_buff *skb) +{ + /* Loop unroll can't be used here due to [1]. Unrolling manually. + * Number of calls should be in sync with NUM_CGROUP_LEVELS. + */ + log_nth_level(skb, 0); + log_nth_level(skb, 1); + log_nth_level(skb, 2); + log_nth_level(skb, 3); + + return TC_ACT_OK; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c deleted file mode 100644 index 37aacc66cd68..000000000000 --- a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include - -#include - -#include - -#define NUM_CGROUP_LEVELS 4 - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, __u32); - __type(value, __u64); - __uint(max_entries, NUM_CGROUP_LEVELS); -} cgroup_ids SEC(".maps"); - -static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) -{ - __u64 id; - - /* [1] &level passed to external function that may change it, it's - * incompatible with loop unroll. - */ - id = bpf_skb_ancestor_cgroup_id(skb, level); - bpf_map_update_elem(&cgroup_ids, &level, &id, 0); -} - -SEC("cgroup_id_logger") -int log_cgroup_id(struct __sk_buff *skb) -{ - /* Loop unroll can't be used here due to [1]. Unrolling manually. - * Number of calls should be in sync with NUM_CGROUP_LEVELS. 
- */ - log_nth_level(skb, 0); - log_nth_level(skb, 1); - log_nth_level(skb, 2); - log_nth_level(skb, 3); - - return TC_ACT_OK; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh index 515c2eafc97f..d7dad49175c2 100755 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh @@ -30,8 +30,10 @@ setup() wait_for_ip tc qdisc add dev ${TEST_IF} clsact - tc filter add dev ${TEST_IF} egress bpf obj ${BPF_PROG_OBJ} \ - sec ${BPF_PROG_SECTION} da + mkdir -p /sys/fs/bpf/${BPF_PROG_PIN} + bpftool prog loadall ${BPF_PROG_OBJ} /sys/fs/bpf/${BPF_PROG_PIN} type tc + tc filter add dev ${TEST_IF} egress bpf da object-pinned \ + /sys/fs/bpf/${BPF_PROG_PIN}/${BPF_PROG_NAME} BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ awk '/ id / {sub(/.* id /, "", $0); print($1)}') @@ -41,6 +43,7 @@ cleanup() { ip link del ${TEST_IF} 2>/dev/null || : ip link del ${TEST_IF_PEER} 2>/dev/null || : + rm -rf /sys/fs/bpf/${BPF_PROG_PIN} } main() @@ -54,8 +57,9 @@ DIR=$(dirname $0) TEST_IF="test_cgid_1" TEST_IF_PEER="test_cgid_2" MAX_PING_TRIES=5 -BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.bpf.o" -BPF_PROG_SECTION="cgroup_id_logger" +BPF_PROG_PIN="cgroup_ancestor" +BPF_PROG_OBJ="${DIR}/${BPF_PROG_PIN}.bpf.o" +BPF_PROG_NAME="log_cgroup_id" BPF_PROG_ID=0 PROG="${DIR}/test_skb_cgroup_id_user" type ping6 >/dev/null 2>&1 && PING6="ping6" || PING6="ping -6" -- cgit v1.2.3 From f957c230e173cf227566686015016d05f7102d27 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 13 Aug 2024 14:45:08 +0200 Subject: selftests/bpf: convert test_skb_cgroup_id_user to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_skb_cgroup_id_user allows testing skb cgroup id retrieval at different levels, but is not integrated in test_progs, so it is not run automatically in CI. The test overlaps a bit with cgroup_skb_sk_lookup_kern, which is integrated in test_progs and test extensively skb cgroup helpers, but there is still one major difference between the two tests which justifies the conversion: cgroup_skb_sk_lookup_kern deals with a BPF_PROG_TYPE_CGROUP_SKB (attached on a cgroup), while test_skb_cgroup_id_user deals with a BPF_PROG_TYPE_SCHED_CLS (attached on a qdisc) Convert test_skb_cgroup_id_user into test_progs framework in order to run it automatically in CI. 
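Because the program is a BPF_PROG_TYPE_SCHED_CLS, the converted test has to reproduce the qdisc and filter setup in C. A sketch of the libbpf TC attach pattern this relies on (an illustrative helper, not the literal code; the actual setup_network() is in the diff below):

  #include <bpf/libbpf.h>
  #include <net/if.h>

  /* Attach prog_fd to the egress hook of "lo", replacing the old shell
   * script's "tc qdisc add ... clsact" + "tc filter add ... bpf" steps.
   */
  int attach_tc_egress(int prog_fd)
  {
          DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook,
                              .ifindex = if_nametoindex("lo"),
                              .attach_point = BPF_TC_EGRESS);
          DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .prog_fd = prog_fd);

          if (!hook.ifindex || bpf_tc_hook_create(&hook))
                  return -1;
          return bpf_tc_attach(&hook, &opts);
  }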
The main differences with the original test are the following: - rename the test to make it shorter and more straightforward regarding tested feature - the wrapping shell script has been dropped since every setup step is now handled in the main C test file - the test has been renamed for a shorter name and reflecting the tested API - add dedicated assert log per level to ease test failure debugging - use global variables instead of maps to access bpf prog data Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-4-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/cgroup_ancestor.c | 141 ++++++++++++++++ .../testing/selftests/bpf/progs/cgroup_ancestor.c | 41 ++--- tools/testing/selftests/bpf/test_skb_cgroup_id.sh | 67 -------- .../selftests/bpf/test_skb_cgroup_id_user.c | 183 --------------------- 6 files changed, 160 insertions(+), 276 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c delete mode 100755 tools/testing/selftests/bpf/test_skb_cgroup_id.sh delete mode 100644 tools/testing/selftests/bpf/test_skb_cgroup_id_user.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index fd7ae37024e2..99ffea1fa5c6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -19,7 +19,6 @@ test_sock urandom_read test_sockmap test_lirc_mode2_user -test_skb_cgroup_id_user test_flow_dissector flow_dissector_load test_tcpnotify_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 99f8d6329694..163b11014158 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -137,7 +137,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \ +TEST_GEN_PROGS_EXTENDED = \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ @@ -290,7 +290,6 @@ JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o NETWORK_HELPERS := $(OUTPUT)/network_helpers.o -$(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c b/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c new file mode 100644 index 000000000000..9250a1e9f9af --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "test_progs.h" +#include "network_helpers.h" +#include "cgroup_helpers.h" +#include "cgroup_ancestor.skel.h" + +#define CGROUP_PATH "/skb_cgroup_test" +#define TEST_NS "cgroup_ancestor_ns" +#define NUM_CGROUP_LEVELS 4 +#define WAIT_AUTO_IP_MAX_ATTEMPT 10 +#define DST_ADDR "::1" +#define DST_PORT 1234 +#define MAX_ASSERT_NAME 32 + +struct test_data { + struct cgroup_ancestor *skel; + struct bpf_tc_hook qdisc; + struct bpf_tc_opts tc_attach; + struct nstoken *ns; +}; + +static int 
send_datagram(void) +{ + unsigned char buf[] = "some random test data"; + struct sockaddr_in6 addr = { .sin6_family = AF_INET6, + .sin6_port = htons(DST_PORT), }; + int sock, n; + + if (!ASSERT_EQ(inet_pton(AF_INET6, DST_ADDR, &addr.sin6_addr), 1, + "inet_pton")) + return -1; + + sock = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_OK_FD(sock, "create socket")) + return sock; + + if (!ASSERT_OK(connect(sock, &addr, sizeof(addr)), "connect")) { + close(sock); + return -1; + } + + n = sendto(sock, buf, sizeof(buf), 0, (const struct sockaddr *)&addr, + sizeof(addr)); + close(sock); + return ASSERT_EQ(n, sizeof(buf), "send data") ? 0 : -1; +} + +static int setup_network(struct test_data *t) +{ + SYS(fail, "ip netns add %s", TEST_NS); + t->ns = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(t->ns, "open netns")) + goto cleanup_ns; + + SYS(close_ns, "ip link set lo up"); + + memset(&t->qdisc, 0, sizeof(t->qdisc)); + t->qdisc.sz = sizeof(t->qdisc); + t->qdisc.attach_point = BPF_TC_EGRESS; + t->qdisc.ifindex = if_nametoindex("lo"); + if (!ASSERT_NEQ(t->qdisc.ifindex, 0, "if_nametoindex")) + goto close_ns; + if (!ASSERT_OK(bpf_tc_hook_create(&t->qdisc), "qdisc add")) + goto close_ns; + + memset(&t->tc_attach, 0, sizeof(t->tc_attach)); + t->tc_attach.sz = sizeof(t->tc_attach); + t->tc_attach.prog_fd = bpf_program__fd(t->skel->progs.log_cgroup_id); + if (!ASSERT_OK(bpf_tc_attach(&t->qdisc, &t->tc_attach), "filter add")) + goto cleanup_qdisc; + + return 0; + +cleanup_qdisc: + bpf_tc_hook_destroy(&t->qdisc); +close_ns: + close_netns(t->ns); +cleanup_ns: + SYS_NOFAIL("ip netns del %s", TEST_NS); +fail: + return 1; +} + +static void cleanup_network(struct test_data *t) +{ + bpf_tc_detach(&t->qdisc, &t->tc_attach); + bpf_tc_hook_destroy(&t->qdisc); + close_netns(t->ns); + SYS_NOFAIL("ip netns del %s", TEST_NS); +} + +static void check_ancestors_ids(struct test_data *t) +{ + __u64 expected_ids[NUM_CGROUP_LEVELS]; + char assert_name[MAX_ASSERT_NAME]; + __u32 level; + + expected_ids[0] = get_cgroup_id("/.."); /* root cgroup */ + expected_ids[1] = get_cgroup_id(""); + expected_ids[2] = get_cgroup_id(CGROUP_PATH); + expected_ids[3] = 0; /* non-existent cgroup */ + + for (level = 0; level < NUM_CGROUP_LEVELS; level++) { + snprintf(assert_name, MAX_ASSERT_NAME, + "ancestor id at level %d", level); + ASSERT_EQ(t->skel->bss->cgroup_ids[level], expected_ids[level], + assert_name); + } +} + +void test_cgroup_ancestor(void) +{ + struct test_data t; + int cgroup_fd; + + t.skel = cgroup_ancestor__open_and_load(); + if (!ASSERT_OK_PTR(t.skel, "open and load")) + return; + + t.skel->bss->dport = htons(DST_PORT); + cgroup_fd = cgroup_setup_and_join(CGROUP_PATH); + if (cgroup_fd < 0) + goto cleanup_progs; + + if (setup_network(&t)) + goto cleanup_cgroups; + + if (send_datagram()) + goto cleanup_network; + + check_ancestors_ids(&t); + +cleanup_network: + cleanup_network(&t); +cleanup_cgroups: + close(cgroup_fd); + cleanup_cgroup_environment(); +cleanup_progs: + cgroup_ancestor__destroy(t.skel); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_ancestor.c b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c index 4879645f5827..8c2deb4fc493 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_ancestor.c +++ b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c @@ -1,43 +1,38 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2018 Facebook -#include -#include - -#include - +#include #include - +#include +#include "bpf_tracing_net.h" #define NUM_CGROUP_LEVELS 4 -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - 
__type(key, __u32); - __type(value, __u64); - __uint(max_entries, NUM_CGROUP_LEVELS); -} cgroup_ids SEC(".maps"); +__u64 cgroup_ids[NUM_CGROUP_LEVELS]; +__u16 dport; static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) { - __u64 id; - /* [1] &level passed to external function that may change it, it's * incompatible with loop unroll. */ - id = bpf_skb_ancestor_cgroup_id(skb, level); - bpf_map_update_elem(&cgroup_ids, &level, &id, 0); + cgroup_ids[level] = bpf_skb_ancestor_cgroup_id(skb, level); } SEC("tc") int log_cgroup_id(struct __sk_buff *skb) { - /* Loop unroll can't be used here due to [1]. Unrolling manually. - * Number of calls should be in sync with NUM_CGROUP_LEVELS. - */ - log_nth_level(skb, 0); - log_nth_level(skb, 1); - log_nth_level(skb, 2); - log_nth_level(skb, 3); + struct sock *sk = (void *)skb->sk; + + if (!sk) + return TC_ACT_OK; + + sk = bpf_core_cast(sk, struct sock); + if (sk->sk_protocol == IPPROTO_UDP && sk->sk_dport == dport) { + log_nth_level(skb, 0); + log_nth_level(skb, 1); + log_nth_level(skb, 2); + log_nth_level(skb, 3); + } return TC_ACT_OK; } diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh deleted file mode 100755 index d7dad49175c2..000000000000 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Copyright (c) 2018 Facebook - -set -eu - -wait_for_ip() -{ - local _i - echo -n "Wait for testing link-local IP to become available " - for _i in $(seq ${MAX_PING_TRIES}); do - echo -n "." - if $PING6 -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then - echo " OK" - return - fi - sleep 1 - done - echo 1>&2 "ERROR: Timeout waiting for test IP to become available." - exit 1 -} - -setup() -{ - # Create testing interfaces not to interfere with current environment. 
- ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} - ip link set ${TEST_IF} up - ip link set ${TEST_IF_PEER} up - - wait_for_ip - - tc qdisc add dev ${TEST_IF} clsact - mkdir -p /sys/fs/bpf/${BPF_PROG_PIN} - bpftool prog loadall ${BPF_PROG_OBJ} /sys/fs/bpf/${BPF_PROG_PIN} type tc - tc filter add dev ${TEST_IF} egress bpf da object-pinned \ - /sys/fs/bpf/${BPF_PROG_PIN}/${BPF_PROG_NAME} - - BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ - awk '/ id / {sub(/.* id /, "", $0); print($1)}') -} - -cleanup() -{ - ip link del ${TEST_IF} 2>/dev/null || : - ip link del ${TEST_IF_PEER} 2>/dev/null || : - rm -rf /sys/fs/bpf/${BPF_PROG_PIN} -} - -main() -{ - trap cleanup EXIT 2 3 6 15 - setup - ${PROG} ${TEST_IF} ${BPF_PROG_ID} -} - -DIR=$(dirname $0) -TEST_IF="test_cgid_1" -TEST_IF_PEER="test_cgid_2" -MAX_PING_TRIES=5 -BPF_PROG_PIN="cgroup_ancestor" -BPF_PROG_OBJ="${DIR}/${BPF_PROG_PIN}.bpf.o" -BPF_PROG_NAME="log_cgroup_id" -BPF_PROG_ID=0 -PROG="${DIR}/test_skb_cgroup_id_user" -type ping6 >/dev/null 2>&1 && PING6="ping6" || PING6="ping -6" - -main diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c deleted file mode 100644 index ed518d075d1d..000000000000 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c +++ /dev/null @@ -1,183 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include -#include - -#include "cgroup_helpers.h" - -#define CGROUP_PATH "/skb_cgroup_test" -#define NUM_CGROUP_LEVELS 4 - -/* RFC 4291, Section 2.7.1 */ -#define LINKLOCAL_MULTICAST "ff02::1" - -static int mk_dst_addr(const char *ip, const char *iface, - struct sockaddr_in6 *dst) -{ - memset(dst, 0, sizeof(*dst)); - - dst->sin6_family = AF_INET6; - dst->sin6_port = htons(1025); - - if (inet_pton(AF_INET6, ip, &dst->sin6_addr) != 1) { - log_err("Invalid IPv6: %s", ip); - return -1; - } - - dst->sin6_scope_id = if_nametoindex(iface); - if (!dst->sin6_scope_id) { - log_err("Failed to get index of iface: %s", iface); - return -1; - } - - return 0; -} - -static int send_packet(const char *iface) -{ - struct sockaddr_in6 dst; - char msg[] = "msg"; - int err = 0; - int fd = -1; - - if (mk_dst_addr(LINKLOCAL_MULTICAST, iface, &dst)) - goto err; - - fd = socket(AF_INET6, SOCK_DGRAM, 0); - if (fd == -1) { - log_err("Failed to create UDP socket"); - goto err; - } - - if (sendto(fd, &msg, sizeof(msg), 0, (const struct sockaddr *)&dst, - sizeof(dst)) == -1) { - log_err("Failed to send datagram"); - goto err; - } - - goto out; -err: - err = -1; -out: - if (fd >= 0) - close(fd); - return err; -} - -int get_map_fd_by_prog_id(int prog_id) -{ - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - __u32 map_ids[1]; - int prog_fd = -1; - int map_fd = -1; - - prog_fd = bpf_prog_get_fd_by_id(prog_id); - if (prog_fd < 0) { - log_err("Failed to get fd by prog id %d", prog_id); - goto err; - } - - info.nr_map_ids = 1; - info.map_ids = (__u64) (unsigned long) map_ids; - - if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) { - log_err("Failed to get info by prog fd %d", prog_fd); - goto err; - } - - if (!info.nr_map_ids) { - log_err("No maps found for prog fd %d", prog_fd); - goto err; - } - - map_fd = bpf_map_get_fd_by_id(map_ids[0]); - if (map_fd < 0) - log_err("Failed to get fd by map id %d", map_ids[0]); -err: - if (prog_fd >= 0) - close(prog_fd); - return map_fd; -} - -int check_ancestor_cgroup_ids(int prog_id) -{ - 
__u64 actual_ids[NUM_CGROUP_LEVELS], expected_ids[NUM_CGROUP_LEVELS]; - __u32 level; - int err = 0; - int map_fd; - - expected_ids[0] = get_cgroup_id("/.."); /* root cgroup */ - expected_ids[1] = get_cgroup_id(""); - expected_ids[2] = get_cgroup_id(CGROUP_PATH); - expected_ids[3] = 0; /* non-existent cgroup */ - - map_fd = get_map_fd_by_prog_id(prog_id); - if (map_fd < 0) - goto err; - - for (level = 0; level < NUM_CGROUP_LEVELS; ++level) { - if (bpf_map_lookup_elem(map_fd, &level, &actual_ids[level])) { - log_err("Failed to lookup key %d", level); - goto err; - } - if (actual_ids[level] != expected_ids[level]) { - log_err("%llx (actual) != %llx (expected), level: %u\n", - actual_ids[level], expected_ids[level], level); - goto err; - } - } - - goto out; -err: - err = -1; -out: - if (map_fd >= 0) - close(map_fd); - return err; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - if (argc < 3) { - fprintf(stderr, "Usage: %s iface prog_id\n", argv[0]); - exit(EXIT_FAILURE); - } - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - cgfd = cgroup_setup_and_join(CGROUP_PATH); - if (cgfd < 0) - goto err; - - if (send_packet(argv[1])) - goto err; - - if (check_ancestor_cgroup_ids(atoi(argv[2]))) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - printf("[%s]\n", err ? "FAIL" : "PASS"); - return err; -} -- cgit v1.2.3 From ac01c8c4246546fd8340a232f3ada1921dc0ee48 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 15 Aug 2024 15:22:12 +0100 Subject: perf hist: Update hist symbol when updating maps AddressSanitizer found a use-after-free bug in the symbol code which manifested as 'perf top' segfaulting. ==1238389==ERROR: AddressSanitizer: heap-use-after-free on address 0x60b00c48844b at pc 0x5650d8035961 bp 0x7f751aaecc90 sp 0x7f751aaecc80 READ of size 1 at 0x60b00c48844b thread T193 #0 0x5650d8035960 in _sort__sym_cmp util/sort.c:310 #1 0x5650d8043744 in hist_entry__cmp util/hist.c:1286 #2 0x5650d8043951 in hists__findnew_entry util/hist.c:614 #3 0x5650d804568f in __hists__add_entry util/hist.c:754 #4 0x5650d8045bf9 in hists__add_entry util/hist.c:772 #5 0x5650d8045df1 in iter_add_single_normal_entry util/hist.c:997 #6 0x5650d8043326 in hist_entry_iter__add util/hist.c:1242 #7 0x5650d7ceeefe in perf_event__process_sample /home/matt/src/linux/tools/perf/builtin-top.c:845 #8 0x5650d7ceeefe in deliver_event /home/matt/src/linux/tools/perf/builtin-top.c:1208 #9 0x5650d7fdb51b in do_flush util/ordered-events.c:245 #10 0x5650d7fdb51b in __ordered_events__flush util/ordered-events.c:324 #11 0x5650d7ced743 in process_thread /home/matt/src/linux/tools/perf/builtin-top.c:1120 #12 0x7f757ef1f133 in start_thread nptl/pthread_create.c:442 #13 0x7f757ef9f7db in clone3 ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 When updating hist maps it's also necessary to update the hist symbol reference because the old one gets freed in map__put(). While this bug was probably introduced with 5c24b67aae72f54c ("perf tools: Replace map->referenced & maps->removed_maps with map->refcnt"), the symbol objects were leaked until c087e9480cf33672 ("perf machine: Fix refcount usage when processing PERF_RECORD_KSYMBOL") was merged so the bug was masked. 
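Condensed, the fix re-resolves the cached symbol in the new map before the old map reference is dropped, since map__put() may free the symbols the hist entry still points at (this is a restatement of the single hunk in the diff below):

  if (he->ms.map != entry->ms.map) {
          if (he->ms.sym) {
                  u64 addr = he->ms.sym->start;
                  /* look the symbol up again in the new map */
                  he->ms.sym = map__find_symbol(entry->ms.map, addr);
          }
          map__put(he->ms.map);   /* may free the old map's symbols */
          he->ms.map = map__get(entry->ms.map);
  }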
Fixes: c087e9480cf33672 ("perf machine: Fix refcount usage when processing PERF_RECORD_KSYMBOL") Reported-by: Yunzhao Li Signed-off-by: Matt Fleming (Cloudflare) Cc: Ian Rogers Cc: kernel-team@cloudflare.com Cc: Namhyung Kim Cc: Riccardo Mancini Cc: stable@vger.kernel.org # v5.13+ Link: https://lore.kernel.org/r/20240815142212.3834625-1-matt@readmodwrite.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 0f554febf9a1..0f9ce2ee2c31 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -639,6 +639,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, * the history counter to increment. */ if (he->ms.map != entry->ms.map) { + if (he->ms.sym) { + u64 addr = he->ms.sym->start; + he->ms.sym = map__find_symbol(entry->ms.map, addr); + } + map__put(he->ms.map); he->ms.map = map__get(entry->ms.map); } -- cgit v1.2.3 From f52403b6bfea6994b017c58367aee9acdb9d9fd4 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:49 -0700 Subject: selftests/bpf: Add traffic monitor functions. Add functions that capture packets and print log in the background. They are supposed to be used for debugging flaky network test cases. A monitored test case should call traffic_monitor_start() to start a thread to capture packets in the background for a given namespace and call traffic_monitor_stop() to stop capturing. (Or, option '-m' implemented by the later patches.) lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 68, SYN lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 60, SYN, ACK lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 60, ACK lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 52, ACK lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 52, FIN, ACK lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 52, RST, ACK Packet file: packets-2173-86-select_reuseport:sockhash_IPv4_TCP_LOOPBACK_test_detach_bpf-test.log #280/87 select_reuseport/sockhash IPv4/TCP LOOPBACK test_detach_bpf:OK The above is the output of an example. It shows the packets of a connection and the name of the file that contains captured packets in the directory /tmp/tmon_pcap. The file can be loaded by tcpdump or wireshark. This feature only works if libpcap is available. (Could be found by pkg-config) Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-2-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 4 + tools/testing/selftests/bpf/network_helpers.c | 454 ++++++++++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 18 + 3 files changed, 476 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 163b11014158..4eceb491a8ae 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -41,6 +41,10 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic \ LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread +LDLIBS += $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) +CFLAGS += $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null) +CFLAGS += $(shell $(PKG_CONFIG) --exists libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") + # The following tests perform type punning and they may break strict # aliasing rules, which are exploited by both GCC and clang by default # while optimizing. 
This can lead to broken programs. diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index a3f0a49fb26f..e0b0a69f066e 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -11,17 +11,31 @@ #include #include #include +#include #include +#include #include #include #include #include +#include +#include +#include +#include + #include "bpf_util.h" #include "network_helpers.h" #include "test_progs.h" +#ifdef TRAFFIC_MONITOR +/* Prevent pcap.h from including pcap/bpf.h and causing conflicts */ +#define PCAP_DONT_INCLUDE_PCAP_BPF_H 1 +#include +#include +#endif + #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 #endif @@ -660,3 +674,443 @@ int send_recv_data(int lfd, int fd, uint32_t total_bytes) return err; } + +#ifdef TRAFFIC_MONITOR +struct tmonitor_ctx { + pcap_t *pcap; + pcap_dumper_t *dumper; + pthread_t thread; + int wake_fd; + + volatile bool done; + char pkt_fname[PATH_MAX]; + int pcap_fd; +}; + +/* Is this packet captured with a Ethernet protocol type? */ +static bool is_ethernet(const u_char *packet) +{ + u16 arphdr_type; + + memcpy(&arphdr_type, packet + 8, 2); + arphdr_type = ntohs(arphdr_type); + + /* Except the following cases, the protocol type contains the + * Ethernet protocol type for the packet. + * + * https://www.tcpdump.org/linktypes/LINKTYPE_LINUX_SLL2.html + */ + switch (arphdr_type) { + case 770: /* ARPHRD_FRAD */ + case 778: /* ARPHDR_IPGRE */ + case 803: /* ARPHRD_IEEE80211_RADIOTAP */ + printf("Packet captured: arphdr_type=%d\n", arphdr_type); + return false; + } + return true; +} + +static const char * const pkt_types[] = { + "In", + "B", /* Broadcast */ + "M", /* Multicast */ + "C", /* Captured with the promiscuous mode */ + "Out", +}; + +static const char *pkt_type_str(u16 pkt_type) +{ + if (pkt_type < ARRAY_SIZE(pkt_types)) + return pkt_types[pkt_type]; + return "Unknown"; +} + +/* Show the information of the transport layer in the packet */ +static void show_transport(const u_char *packet, u16 len, u32 ifindex, + const char *src_addr, const char *dst_addr, + u16 proto, bool ipv6, u8 pkt_type) +{ + char *ifname, _ifname[IF_NAMESIZE]; + const char *transport_str; + u16 src_port, dst_port; + struct udphdr *udp; + struct tcphdr *tcp; + + ifname = if_indextoname(ifindex, _ifname); + if (!ifname) { + snprintf(_ifname, sizeof(_ifname), "unknown(%d)", ifindex); + ifname = _ifname; + } + + if (proto == IPPROTO_UDP) { + udp = (struct udphdr *)packet; + src_port = ntohs(udp->source); + dst_port = ntohs(udp->dest); + transport_str = "UDP"; + } else if (proto == IPPROTO_TCP) { + tcp = (struct tcphdr *)packet; + src_port = ntohs(tcp->source); + dst_port = ntohs(tcp->dest); + transport_str = "TCP"; + } else if (proto == IPPROTO_ICMP) { + printf("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); + return; + } else if (proto == IPPROTO_ICMPV6) { + printf("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); + return; + } else { + printf("%-7s %-3s %s %s > %s: protocol %d\n", + ifname, pkt_type_str(pkt_type), ipv6 ? 
"IPv6" : "IPv4", + src_addr, dst_addr, proto); + return; + } + + /* TCP or UDP*/ + + flockfile(stdout); + if (ipv6) + printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len); + else + printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len); + + if (proto == IPPROTO_TCP) { + if (tcp->fin) + printf(", FIN"); + if (tcp->syn) + printf(", SYN"); + if (tcp->rst) + printf(", RST"); + if (tcp->ack) + printf(", ACK"); + } + + printf("\n"); + funlockfile(stdout); +} + +static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +{ + char src_buf[INET6_ADDRSTRLEN], dst_buf[INET6_ADDRSTRLEN]; + struct ipv6hdr *pkt = (struct ipv6hdr *)packet; + const char *src, *dst; + u_char proto; + + src = inet_ntop(AF_INET6, &pkt->saddr, src_buf, sizeof(src_buf)); + if (!src) + src = ""; + dst = inet_ntop(AF_INET6, &pkt->daddr, dst_buf, sizeof(dst_buf)); + if (!dst) + dst = ""; + proto = pkt->nexthdr; + show_transport(packet + sizeof(struct ipv6hdr), + ntohs(pkt->payload_len), + ifindex, src, dst, proto, true, pkt_type); +} + +static void show_ipv4_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +{ + char src_buf[INET_ADDRSTRLEN], dst_buf[INET_ADDRSTRLEN]; + struct iphdr *pkt = (struct iphdr *)packet; + const char *src, *dst; + u_char proto; + + src = inet_ntop(AF_INET, &pkt->saddr, src_buf, sizeof(src_buf)); + if (!src) + src = ""; + dst = inet_ntop(AF_INET, &pkt->daddr, dst_buf, sizeof(dst_buf)); + if (!dst) + dst = ""; + proto = pkt->protocol; + show_transport(packet + sizeof(struct iphdr), + ntohs(pkt->tot_len), + ifindex, src, dst, proto, false, pkt_type); +} + +static void *traffic_monitor_thread(void *arg) +{ + char *ifname, _ifname[IF_NAMESIZE]; + const u_char *packet, *payload; + struct tmonitor_ctx *ctx = arg; + pcap_dumper_t *dumper = ctx->dumper; + int fd = ctx->pcap_fd, nfds, r; + int wake_fd = ctx->wake_fd; + struct pcap_pkthdr header; + pcap_t *pcap = ctx->pcap; + u32 ifindex; + fd_set fds; + u16 proto; + u8 ptype; + + nfds = (fd > wake_fd ? fd : wake_fd) + 1; + FD_ZERO(&fds); + + while (!ctx->done) { + FD_SET(fd, &fds); + FD_SET(wake_fd, &fds); + r = select(nfds, &fds, NULL, NULL, NULL); + if (!r) + continue; + if (r < 0) { + if (errno == EINTR) + continue; + log_err("Fail to select on pcap fd and wake fd"); + break; + } + + /* This instance of pcap is non-blocking */ + packet = pcap_next(pcap, &header); + if (!packet) + continue; + + /* According to the man page of pcap_dump(), first argument + * is the pcap_dumper_t pointer even it's argument type is + * u_char *. + */ + pcap_dump((u_char *)dumper, &header, packet); + + /* Not sure what other types of packets look like. Here, we + * parse only Ethernet and compatible packets. + */ + if (!is_ethernet(packet)) + continue; + + /* Skip SLL2 header + * https://www.tcpdump.org/linktypes/LINKTYPE_LINUX_SLL2.html + * + * Although the document doesn't mention that, the payload + * doesn't include the Ethernet header. The payload starts + * from the first byte of the network layer header. 
+ */ + payload = packet + 20; + + memcpy(&proto, packet, 2); + proto = ntohs(proto); + memcpy(&ifindex, packet + 4, 4); + ifindex = ntohl(ifindex); + ptype = packet[10]; + + if (proto == ETH_P_IPV6) { + show_ipv6_packet(payload, ifindex, ptype); + } else if (proto == ETH_P_IP) { + show_ipv4_packet(payload, ifindex, ptype); + } else { + ifname = if_indextoname(ifindex, _ifname); + if (!ifname) { + snprintf(_ifname, sizeof(_ifname), "unknown(%d)", ifindex); + ifname = _ifname; + } + + printf("%-7s %-3s Unknown network protocol type 0x%x\n", + ifname, pkt_type_str(ptype), proto); + } + } + + return NULL; +} + +/* Prepare the pcap handle to capture packets. + * + * This pcap is non-blocking and immediate mode is enabled to receive + * captured packets as soon as possible. The snaplen is set to 1024 bytes + * to limit the size of captured content. The format of the link-layer + * header is set to DLT_LINUX_SLL2 to enable handling various link-layer + * technologies. + */ +static pcap_t *traffic_monitor_prepare_pcap(void) +{ + char errbuf[PCAP_ERRBUF_SIZE]; + pcap_t *pcap; + int r; + + /* Listen on all NICs in the namespace */ + pcap = pcap_create("any", errbuf); + if (!pcap) { + log_err("Failed to open pcap: %s", errbuf); + return NULL; + } + /* Limit the size of the packet (first N bytes) */ + r = pcap_set_snaplen(pcap, 1024); + if (r) { + log_err("Failed to set snaplen: %s", pcap_geterr(pcap)); + goto error; + } + /* To receive packets as fast as possible */ + r = pcap_set_immediate_mode(pcap, 1); + if (r) { + log_err("Failed to set immediate mode: %s", pcap_geterr(pcap)); + goto error; + } + r = pcap_setnonblock(pcap, 1, errbuf); + if (r) { + log_err("Failed to set nonblock: %s", errbuf); + goto error; + } + r = pcap_activate(pcap); + if (r) { + log_err("Failed to activate pcap: %s", pcap_geterr(pcap)); + goto error; + } + /* Determine the format of the link-layer header */ + r = pcap_set_datalink(pcap, DLT_LINUX_SLL2); + if (r) { + log_err("Failed to set datalink: %s", pcap_geterr(pcap)); + goto error; + } + + return pcap; +error: + pcap_close(pcap); + return NULL; +} + +static void encode_test_name(char *buf, size_t len, const char *test_name, const char *subtest_name) +{ + char *p; + + if (subtest_name) + snprintf(buf, len, "%s__%s", test_name, subtest_name); + else + snprintf(buf, len, "%s", test_name); + while ((p = strchr(buf, '/'))) + *p = '_'; + while ((p = strchr(buf, ' '))) + *p = '_'; +} + +#define PCAP_DIR "/tmp/tmon_pcap" + +/* Start to monitor the network traffic in the given network namespace. + * + * netns: the name of the network namespace to monitor. If NULL, the + * current network namespace is monitored. + * test_name: the name of the running test. + * subtest_name: the name of the running subtest if there is. It should be + * NULL if it is not a subtest. + * + * This function will start a thread to capture packets going through NICs + * in the give network namespace. 
+ */ +struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +{ + struct nstoken *nstoken = NULL; + struct tmonitor_ctx *ctx; + char test_name_buf[64]; + static int tmon_seq; + int r; + + if (netns) { + nstoken = open_netns(netns); + if (!nstoken) + return NULL; + } + ctx = malloc(sizeof(*ctx)); + if (!ctx) { + log_err("Failed to malloc ctx"); + goto fail_ctx; + } + memset(ctx, 0, sizeof(*ctx)); + + encode_test_name(test_name_buf, sizeof(test_name_buf), test_name, subtest_name); + snprintf(ctx->pkt_fname, sizeof(ctx->pkt_fname), + PCAP_DIR "/packets-%d-%d-%s-%s.log", getpid(), tmon_seq++, + test_name_buf, netns ? netns : "unknown"); + + r = mkdir(PCAP_DIR, 0755); + if (r && errno != EEXIST) { + log_err("Failed to create " PCAP_DIR); + goto fail_pcap; + } + + ctx->pcap = traffic_monitor_prepare_pcap(); + if (!ctx->pcap) + goto fail_pcap; + ctx->pcap_fd = pcap_get_selectable_fd(ctx->pcap); + if (ctx->pcap_fd < 0) { + log_err("Failed to get pcap fd"); + goto fail_dumper; + } + + /* Create a packet file */ + ctx->dumper = pcap_dump_open(ctx->pcap, ctx->pkt_fname); + if (!ctx->dumper) { + log_err("Failed to open pcap dump: %s", ctx->pkt_fname); + goto fail_dumper; + } + + /* Create an eventfd to wake up the monitor thread */ + ctx->wake_fd = eventfd(0, 0); + if (ctx->wake_fd < 0) { + log_err("Failed to create eventfd"); + goto fail_eventfd; + } + + r = pthread_create(&ctx->thread, NULL, traffic_monitor_thread, ctx); + if (r) { + log_err("Failed to create thread"); + goto fail; + } + + close_netns(nstoken); + + return ctx; + +fail: + close(ctx->wake_fd); + +fail_eventfd: + pcap_dump_close(ctx->dumper); + unlink(ctx->pkt_fname); + +fail_dumper: + pcap_close(ctx->pcap); + +fail_pcap: + free(ctx); + +fail_ctx: + close_netns(nstoken); + + return NULL; +} + +static void traffic_monitor_release(struct tmonitor_ctx *ctx) +{ + pcap_close(ctx->pcap); + pcap_dump_close(ctx->dumper); + + close(ctx->wake_fd); + + free(ctx); +} + +/* Stop the network traffic monitor. + * + * ctx: the context returned by traffic_monitor_start() + */ +void traffic_monitor_stop(struct tmonitor_ctx *ctx) +{ + __u64 w = 1; + + if (!ctx) + return; + + /* Stop the monitor thread */ + ctx->done = true; + /* Wake up the background thread. */ + write(ctx->wake_fd, &w, sizeof(w)); + pthread_join(ctx->thread, NULL); + + printf("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); + + traffic_monitor_release(ctx); +} +#endif /* TRAFFIC_MONITOR */ diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index cce56955371f..0d032ae706c6 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -136,4 +136,22 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, return csum_fold((__u32)s); } +struct tmonitor_ctx; + +#ifdef TRAFFIC_MONITOR +struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name); +void traffic_monitor_stop(struct tmonitor_ctx *ctx); +#else +static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +{ + return NULL; +} + +static inline void traffic_monitor_stop(struct tmonitor_ctx *ctx) +{ +} +#endif + #endif -- cgit v1.2.3 From f5281aacec856e7e0beb643d74122595fc1fb4be Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:50 -0700 Subject: selftests/bpf: Add the traffic monitor option to test_progs. 
Add option '-m' to test_progs to accept names and patterns of test cases. This option will be used later to enable traffic monitor that capture network packets generated by test cases. Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-3-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_progs.c | 86 +++++++++++++++++++++++--------- tools/testing/selftests/bpf/test_progs.h | 2 + 2 files changed, 64 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 60fafa2f1ed7..f8ed1a16a884 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -155,6 +155,7 @@ struct prog_test_def { void (*run_serial_test)(void); bool should_run; bool need_cgroup_cleanup; + bool should_tmon; }; /* Override C runtime library's usleep() implementation to ensure nanosleep() @@ -192,46 +193,59 @@ static bool should_run(struct test_selector *sel, int num, const char *name) return num < sel->num_set_len && sel->num_set[num]; } -static bool should_run_subtest(struct test_selector *sel, - struct test_selector *subtest_sel, - int subtest_num, - const char *test_name, - const char *subtest_name) +static bool match_subtest(struct test_filter_set *filter, + const char *test_name, + const char *subtest_name) { int i, j; - for (i = 0; i < sel->blacklist.cnt; i++) { - if (glob_match(test_name, sel->blacklist.tests[i].name)) { - if (!sel->blacklist.tests[i].subtest_cnt) - return false; - - for (j = 0; j < sel->blacklist.tests[i].subtest_cnt; j++) { - if (glob_match(subtest_name, - sel->blacklist.tests[i].subtests[j])) - return false; - } - } - } - - for (i = 0; i < sel->whitelist.cnt; i++) { - if (glob_match(test_name, sel->whitelist.tests[i].name)) { - if (!sel->whitelist.tests[i].subtest_cnt) + for (i = 0; i < filter->cnt; i++) { + if (glob_match(test_name, filter->tests[i].name)) { + if (!filter->tests[i].subtest_cnt) return true; - for (j = 0; j < sel->whitelist.tests[i].subtest_cnt; j++) { + for (j = 0; j < filter->tests[i].subtest_cnt; j++) { if (glob_match(subtest_name, - sel->whitelist.tests[i].subtests[j])) + filter->tests[i].subtests[j])) return true; } } } + return false; +} + +static bool should_run_subtest(struct test_selector *sel, + struct test_selector *subtest_sel, + int subtest_num, + const char *test_name, + const char *subtest_name) +{ + if (match_subtest(&sel->blacklist, test_name, subtest_name)) + return false; + + if (match_subtest(&sel->whitelist, test_name, subtest_name)) + return true; + if (!sel->whitelist.cnt && !subtest_sel->num_set) return true; return subtest_num < subtest_sel->num_set_len && subtest_sel->num_set[subtest_num]; } +static bool should_tmon(struct test_selector *sel, const char *name) +{ + int i; + + for (i = 0; i < sel->whitelist.cnt; i++) { + if (glob_match(name, sel->whitelist.tests[i].name) && + !sel->whitelist.tests[i].subtest_cnt) + return true; + } + + return false; +} + static char *test_result(bool failed, bool skipped) { return failed ? "FAIL" : (skipped ? 
"SKIP" : "OK"); @@ -488,6 +502,10 @@ bool test__start_subtest(const char *subtest_name) return false; } + subtest_state->should_tmon = match_subtest(&env.tmon_selector.whitelist, + test->test_name, + subtest_name); + env.subtest_state = subtest_state; stdio_hijack_init(&subtest_state->log_buf, &subtest_state->log_cnt); @@ -667,7 +685,8 @@ enum ARG_KEYS { ARG_TEST_NAME_GLOB_DENYLIST = 'd', ARG_NUM_WORKERS = 'j', ARG_DEBUG = -1, - ARG_JSON_SUMMARY = 'J' + ARG_JSON_SUMMARY = 'J', + ARG_TRAFFIC_MONITOR = 'm', }; static const struct argp_option opts[] = { @@ -694,6 +713,10 @@ static const struct argp_option opts[] = { { "debug", ARG_DEBUG, NULL, 0, "print extra debug information for test_progs." }, { "json-summary", ARG_JSON_SUMMARY, "FILE", 0, "Write report in json format to this file."}, +#ifdef TRAFFIC_MONITOR + { "traffic-monitor", ARG_TRAFFIC_MONITOR, "NAMES", 0, + "Monitor network traffic of tests with name matching the pattern (supports '*' wildcard)." }, +#endif {}, }; @@ -905,6 +928,18 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) break; case ARGP_KEY_END: break; +#ifdef TRAFFIC_MONITOR + case ARG_TRAFFIC_MONITOR: + if (arg[0] == '@') + err = parse_test_list_file(arg + 1, + &env->tmon_selector.whitelist, + true); + else + err = parse_test_list(arg, + &env->tmon_selector.whitelist, + true); + break; +#endif default: return ARGP_ERR_UNKNOWN; } @@ -1736,6 +1771,8 @@ int main(int argc, char **argv) test->test_num, test->test_name, test->test_name, test->test_name); exit(EXIT_ERR_SETUP_INFRA); } + if (test->should_run) + test->should_tmon = should_tmon(&env.tmon_selector, test->test_name); } /* ignore workers if we are just listing */ @@ -1820,6 +1857,7 @@ out: free_test_selector(&env.test_selector); free_test_selector(&env.subtest_selector); + free_test_selector(&env.tmon_selector); free_test_states(); if (env.succ_cnt + env.fail_cnt + env.skip_cnt == 0) diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index cb9d6d46826b..966011eb7ec8 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -74,6 +74,7 @@ struct subtest_state { int error_cnt; bool skipped; bool filtered; + bool should_tmon; FILE *stdout_saved; }; @@ -98,6 +99,7 @@ struct test_state { struct test_env { struct test_selector test_selector; struct test_selector subtest_selector; + struct test_selector tmon_selector; bool verifier_stats; bool debug; enum verbosity verbosity; -- cgit v1.2.3 From 1e115a58be0ffca63727dc0495dae924a19f8cd4 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:51 -0700 Subject: selftests/bpf: netns_new() and netns_free() helpers. netns_new()/netns_free() create/delete network namespaces. They support the option '-m' of test_progs to start/stop traffic monitor for the network namespace being created for matched tests. 
Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-4-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 46 ++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 2 + tools/testing/selftests/bpf/test_progs.c | 88 +++++++++++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 4 ++ 4 files changed, 140 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index e0b0a69f066e..27784946b01b 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -446,6 +446,52 @@ char *ping_command(int family) return "ping"; } +int remove_netns(const char *name) +{ + char *cmd; + int r; + + r = asprintf(&cmd, "ip netns del %s >/dev/null 2>&1", name); + if (r < 0) { + log_err("Failed to malloc cmd"); + return -1; + } + + r = system(cmd); + free(cmd); + return r; +} + +int make_netns(const char *name) +{ + char *cmd; + int r; + + r = asprintf(&cmd, "ip netns add %s", name); + if (r < 0) { + log_err("Failed to malloc cmd"); + return -1; + } + + r = system(cmd); + free(cmd); + + if (r) + return r; + + r = asprintf(&cmd, "ip -n %s link set lo up", name); + if (r < 0) { + log_err("Failed to malloc cmd for setting up lo"); + remove_netns(name); + return -1; + } + + r = system(cmd); + free(cmd); + + return r; +} + struct nstoken { int orig_netns_fd; }; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 0d032ae706c6..c72c16e1aff8 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -93,6 +93,8 @@ struct nstoken; struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); int send_recv_data(int lfd, int fd, uint32_t total_bytes); +int make_netns(const char *name); +int remove_netns(const char *name); static __u16 csum_fold(__u32 csum) { diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index f8ed1a16a884..f45b06791444 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -18,6 +18,8 @@ #include #include "json_writer.h" +#include "network_helpers.h" + #ifdef __GLIBC__ #include /* backtrace */ #endif @@ -642,6 +644,92 @@ out: return err; } +struct netns_obj { + char *nsname; + struct tmonitor_ctx *tmon; + struct nstoken *nstoken; +}; + +/* Create a new network namespace with the given name. + * + * Create a new network namespace and set the network namespace of the + * current process to the new network namespace if the argument "open" is + * true. This function should be paired with netns_free() to release the + * resource and delete the network namespace. + * + * It also implements the functionality of the option "-m" by starting + * traffic monitor on the background to capture the packets in this network + * namespace if the current test or subtest matching the pattern. + * + * nsname: the name of the network namespace to create. + * open: open the network namespace if true. + * + * Return: the network namespace object on success, NULL on failure. 
+ */ +struct netns_obj *netns_new(const char *nsname, bool open) +{ + struct netns_obj *netns_obj = malloc(sizeof(*netns_obj)); + const char *test_name, *subtest_name; + int r; + + if (!netns_obj) + return NULL; + memset(netns_obj, 0, sizeof(*netns_obj)); + + netns_obj->nsname = strdup(nsname); + if (!netns_obj->nsname) + goto fail; + + /* Create the network namespace */ + r = make_netns(nsname); + if (r) + goto fail; + + /* Start traffic monitor */ + if (env.test->should_tmon || + (env.subtest_state && env.subtest_state->should_tmon)) { + test_name = env.test->test_name; + subtest_name = env.subtest_state ? env.subtest_state->name : NULL; + netns_obj->tmon = traffic_monitor_start(nsname, test_name, subtest_name); + if (!netns_obj->tmon) { + fprintf(stderr, "Failed to start traffic monitor for %s\n", nsname); + goto fail; + } + } else { + netns_obj->tmon = NULL; + } + + if (open) { + netns_obj->nstoken = open_netns(nsname); + if (!netns_obj->nstoken) + goto fail; + } + + return netns_obj; +fail: + traffic_monitor_stop(netns_obj->tmon); + remove_netns(nsname); + free(netns_obj->nsname); + free(netns_obj); + return NULL; +} + +/* Delete the network namespace. + * + * This function should be paired with netns_new() to delete the namespace + * created by netns_new(). + */ +void netns_free(struct netns_obj *netns_obj) +{ + if (!netns_obj) + return; + traffic_monitor_stop(netns_obj->tmon); + close_netns(netns_obj->nstoken); + remove_netns(netns_obj->nsname); + free(netns_obj->nsname); + free(netns_obj); +} + /* extern declarations for test funcs */ #define DEFINE_TEST(name) \ extern void test_##name(void) __weak; \ diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 966011eb7ec8..3ad131de14c6 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -430,6 +430,10 @@ int write_sysctl(const char *sysctl, const char *value); int get_bpf_max_tramp_links_from(struct btf *btf); int get_bpf_max_tramp_links(void); +struct netns_obj; +struct netns_obj *netns_new(const char *name, bool open); +void netns_free(struct netns_obj *netns); + #ifdef __x86_64__ #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" #elif defined(__s390x__) -- cgit v1.2.3 From 52a5b8a30fa882c4d363f7679b7db4a093550ac1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:52 -0700 Subject: selftests/bpf: Monitor traffic for tc_redirect. Enable traffic monitoring for the test case tc_redirect. 
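The conversion swaps the "ip netns" shell invocations for the new
helpers, which also arm the traffic monitor for matched tests. In
condensed form (simplified from the hunks below):

    /* before: shell out to iproute2 */
    snprintf(cmd, sizeof(cmd), "ip netns %s %s", verb, *ns);
    system(cmd);

    /* after: "add" and "delete" go through the helpers */
    *ns_obj = netns_new(*ns, false);
    ...
    netns_free(*ns_obj);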
Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-5-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 29 ++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 53b8ffc943dc..974f9d6269c9 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -68,6 +68,7 @@ __FILE__, __LINE__, strerror(errno), ##__VA_ARGS__) static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL}; +static struct netns_obj *netns_objs[3]; static int write_file(const char *path, const char *newval) { @@ -87,27 +88,41 @@ static int write_file(const char *path, const char *newval) static int netns_setup_namespaces(const char *verb) { + struct netns_obj **ns_obj = netns_objs; const char * const *ns = namespaces; - char cmd[128]; while (*ns) { - snprintf(cmd, sizeof(cmd), "ip netns %s %s", verb, *ns); - if (!ASSERT_OK(system(cmd), cmd)) - return -1; + if (strcmp(verb, "add") == 0) { + *ns_obj = netns_new(*ns, false); + if (!ASSERT_OK_PTR(*ns_obj, "netns_new")) + return -1; + } else { + if (!ASSERT_OK_PTR(*ns_obj, "netns_obj is NULL")) + return -1; + netns_free(*ns_obj); + *ns_obj = NULL; + } ns++; + ns_obj++; } return 0; } static void netns_setup_namespaces_nofail(const char *verb) { + struct netns_obj **ns_obj = netns_objs; const char * const *ns = namespaces; - char cmd[128]; while (*ns) { - snprintf(cmd, sizeof(cmd), "ip netns %s %s > /dev/null 2>&1", verb, *ns); - system(cmd); + if (strcmp(verb, "add") == 0) { + *ns_obj = netns_new(*ns, false); + } else { + if (*ns_obj) + netns_free(*ns_obj); + *ns_obj = NULL; + } ns++; + ns_obj++; } } -- cgit v1.2.3 From b407b52b18505a7c09a77cbab022ec6cf82dc5f0 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:53 -0700 Subject: selftests/bpf: Monitor traffic for sockmap_listen. Enable traffic monitor for each subtest of sockmap_listen. 
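Each converted subtest now runs inside a freshly created namespace so
that the monitor captures only that subtest's packets. The pattern, as
applied in the hunk below:

    if (!test__start_subtest(s))
        return;

    netns = netns_new("sockmap_listen", true);
    if (!ASSERT_OK_PTR(netns, "netns_new"))
        return;

    /* ... redirect tests run inside the namespace ... */

    netns_free(netns);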
Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-6-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 9ce0e0e0b7da..ebb5f6336052 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1925,6 +1925,7 @@ static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map int family) { const char *family_name, *map_name; + struct netns_obj *netns; char s[MAX_TEST_NAME]; family_name = family_str(family); @@ -1932,8 +1933,15 @@ static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); if (!test__start_subtest(s)) return; + + netns = netns_new("sockmap_listen", true); + if (!ASSERT_OK_PTR(netns, "netns_new")) + return; + inet_unix_skb_redir_to_connected(skel, map, family); unix_inet_skb_redir_to_connected(skel, map, family); + + netns_free(netns); } static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, -- cgit v1.2.3 From 69354085975ac61632ebceedebbeeeb0272620b8 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:54 -0700 Subject: selftests/bpf: Monitor traffic for select_reuseport. Enable traffic monitoring for the subtests of select_reuseport. Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-7-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/select_reuseport.c | 37 ++++++++-------------- 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 64c5f5eb2994..036d4760d2c1 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -37,9 +37,7 @@ static int sk_fds[REUSEPORT_ARRAY_SIZE]; static int reuseport_array = -1, outer_map = -1; static enum bpf_map_type inner_map_type; static int select_by_skb_data_prog; -static int saved_tcp_syncookie = -1; static struct bpf_object *obj; -static int saved_tcp_fo = -1; static __u32 index_zero; static int epfd; @@ -193,14 +191,6 @@ static int write_int_sysctl(const char *sysctl, int v) return 0; } -static void restore_sysctls(void) -{ - if (saved_tcp_fo != -1) - write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); - if (saved_tcp_syncookie != -1) - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); -} - static int enable_fastopen(void) { int fo; @@ -793,6 +783,7 @@ static void test_config(int sotype, sa_family_t family, bool inany) TEST_INIT(test_pass_on_err), TEST_INIT(test_detach_bpf), }; + struct netns_obj *netns; char s[MAX_TEST_NAME]; const struct test *t; @@ -808,9 +799,21 @@ static void test_config(int sotype, sa_family_t family, bool inany) if (!test__start_subtest(s)) continue; + netns = netns_new("select_reuseport", true); + if (!ASSERT_OK_PTR(netns, "netns_new")) + continue; + + if (CHECK_FAIL(enable_fastopen())) + goto out; + if (CHECK_FAIL(disable_syncookie())) + goto out; + setup_per_test(sotype, family, inany, t->no_inner_map); t->fn(sotype, family); 
cleanup_per_test(t->no_inner_map); + +out: + netns_free(netns); } } @@ -850,21 +853,7 @@ out: void serial_test_select_reuseport(void) { - saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); - if (saved_tcp_fo < 0) - goto out; - saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); - if (saved_tcp_syncookie < 0) - goto out; - - if (enable_fastopen()) - goto out; - if (disable_syncookie()) - goto out; - test_map_type(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); test_map_type(BPF_MAP_TYPE_SOCKMAP); test_map_type(BPF_MAP_TYPE_SOCKHASH); -out: - restore_sysctls(); } -- cgit v1.2.3 From b53f20b323ee3c5daa5494ad69e3ae0e77cfd82a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 15 Aug 2024 09:20:46 +0200 Subject: tools build: Provide consistent build options for fixdep The fixdep binary is being compiled and linked in one step. While the host linker flags are passed to the compiler the host compiler flags are missed. That leads to build errors at least on x86_64, arm64 and s390 as result of the compiler vs linker flags inconsistency. For example, during RPM package build redhat-hardened-ld script is provided to gcc, while redhat-hardened-cc1 script is missed. Provide both KBUILD_HOSTCFLAGS and KBUILD_HOSTLDFLAGS to avoid that. Fixes: ea974028a049f2ce ("tools build: Avoid circular .fixdep-in.o.cmd issues") Closes: https://lore.kernel.org/lkml/99ae0d34-ed76-4ca0-a9fd-c337da33c9f9@leemhuis.info/ Reported-by: Thorsten Leemhuis Reviewed-by: Brian Norris Signed-off-by: Alexander Gordeev Tested-by: Thorsten Leemhuis Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240815072046.1002837-1-agordeev@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/build/Makefile b/tools/build/Makefile index fea3cf647f5b..18ad131f6ea7 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -44,4 +44,4 @@ ifneq ($(wildcard $(TMP_O)),) endif $(OUTPUT)fixdep: $(srctree)/tools/build/fixdep.c - $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTLDFLAGS) -o $@ $< + $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTCFLAGS) $(KBUILD_HOSTLDFLAGS) -o $@ $< -- cgit v1.2.3 From fab45b962749184e1a1a57c7c583782b78fad539 Mon Sep 17 00:00:00 2001 From: Sam James Date: Tue, 13 Aug 2024 20:49:06 +0100 Subject: libbpf: Workaround -Wmaybe-uninitialized false positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `elf_close`, we get this with GCC 15 -O3 (at least): ``` In function ‘elf_close’, inlined from ‘elf_close’ at elf.c:53:6, inlined from ‘elf_find_func_offset_from_file’ at elf.c:384:2: elf.c:57:9: warning: ‘elf_fd.elf’ may be used uninitialized [-Wmaybe-uninitialized] 57 | elf_end(elf_fd->elf); | ^~~~~~~~~~~~~~~~~~~~ elf.c: In function ‘elf_find_func_offset_from_file’: elf.c:377:23: note: ‘elf_fd.elf’ was declared here 377 | struct elf_fd elf_fd; | ^~~~~~ In function ‘elf_close’, inlined from ‘elf_close’ at elf.c:53:6, inlined from ‘elf_find_func_offset_from_file’ at elf.c:384:2: elf.c:58:9: warning: ‘elf_fd.fd’ may be used uninitialized [-Wmaybe-uninitialized] 58 | close(elf_fd->fd); | ^~~~~~~~~~~~~~~~~ elf.c: In function ‘elf_find_func_offset_from_file’: elf.c:377:23: note: ‘elf_fd.fd’ was declared here 377 | struct elf_fd elf_fd; | ^~~~~~ ``` In reality, our use is fine, it's just that GCC doesn't model errno here (see linked GCC bug). Suppress -Wmaybe-uninitialized accordingly by initializing elf_fd.fd to -1 and elf_fd.elf to NULL. 
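A condensed sketch of the pattern GCC mis-analyzes (simplified, not the
actual libbpf code):

    struct elf_fd elf_fd;        /* members start uninitialized */

    if (elf_open(path, &elf_fd)) /* failure leaves them unset, but
                                  * GCC cannot prove we bail out here */
        return -1;
    /* ... */
    elf_close(&elf_fd);          /* -> spurious "may be used
                                  * uninitialized" warning */

Initializing both members up front is cheap and silences the warning on
every path.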
I've done this in two other functions as well given it could easily occur there too (same access/use pattern). Signed-off-by: Sam James Signed-off-by: Andrii Nakryiko Link: https://gcc.gnu.org/PR114952 Link: https://lore.kernel.org/bpf/14ec488a1cac02794c2fa2b83ae0cef1bce2cb36.1723578546.git.sam@gentoo.org --- tools/lib/bpf/elf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c index c92e02394159..b5ab1cb13e5e 100644 --- a/tools/lib/bpf/elf.c +++ b/tools/lib/bpf/elf.c @@ -28,6 +28,9 @@ int elf_open(const char *binary_path, struct elf_fd *elf_fd) int fd, ret; Elf *elf; + elf_fd->elf = NULL; + elf_fd->fd = -1; + if (elf_version(EV_CURRENT) == EV_NONE) { pr_warn("elf: failed to init libelf for %s\n", binary_path); return -LIBBPF_ERRNO__LIBELF; -- cgit v1.2.3 From e7d731326ef0622f103e5ed47d3405f71cdcd7f6 Mon Sep 17 00:00:00 2001 From: Abhash Jha Date: Wed, 14 Aug 2024 23:01:21 +0530 Subject: selftests/net/pmtu.sh: Fix typo in error message The word 'expected' was spelled as 'exepcted'. Fixed the typo in this patch. Signed-off-by: Abhash Jha Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240814173121.33590-1-abhashkumarjha123@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 5175c0c83a23..24a50622406c 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -1447,7 +1447,7 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() { size=$(du -sb $tmpoutfile) size=${size%%/tmp/*} - [ $size -ne 1048576 ] && err "File size $size mismatches exepcted value in locally bridged vxlan test" && return 1 + [ $size -ne 1048576 ] && err "File size $size mismatches expected value in locally bridged vxlan test" && return 1 done rm -f "$tmpoutfile" -- cgit v1.2.3 From 0021d6670d1a997549a39bf629da9940bf4068ed Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 12 Aug 2024 22:50:17 +0200 Subject: tools/nolibc: crt: mark _start_c() as used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During LTO the reference from the asm startup code to the _start_c() function is not visible and _start_c() is removed. This will then lead to errors during linking. As _start_c() is indeed always used, mark it as such. 
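The failure mode in a minimal form (illustrative example, not the
nolibc sources):

    /* referenced only from inline asm -- invisible to LTO: */
    void _start_c(long *sp) { /* ... */ }

    __asm__ (".global _start\n"
             "_start:\n"
             "    call _start_c\n");

Under -flto the compiler sees no C-level caller, discards _start_c(),
and the final link then fails on the unresolved reference from the
startup asm. __attribute__((used)) pins the definition in the object
file.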
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240812-nolibc-lto-v2-1-736af7bbefa8@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/crt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index ac291574f6c0..bbcd5fd09806 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -22,7 +22,7 @@ extern void (*const __init_array_end[])(int, char **, char**) __attribute__((wea extern void (*const __fini_array_start[])(void) __attribute__((weak)); extern void (*const __fini_array_end[])(void) __attribute__((weak)); -__attribute__((weak)) +__attribute__((weak,used)) void _start_c(long *sp) { long argc; -- cgit v1.2.3 From ff7b9abbfce985b92f71c855246508edb0980cd6 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 12 Aug 2024 22:50:18 +0200 Subject: tools/nolibc: stackprotector: mark implicitly used symbols as used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During LTO the references from the compiler-generated prologue and epilogues to the stack protector symbols are not visible and the symbols are removed. This will then lead to errors during linking. As those symbols are already #ifdeffed-out if unused mark them as "used" to prevent their removal. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240812-nolibc-lto-v2-2-736af7bbefa8@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/stackprotector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h index 13f1d0e60387..1d0d5259ec41 100644 --- a/tools/include/nolibc/stackprotector.h +++ b/tools/include/nolibc/stackprotector.h @@ -18,7 +18,7 @@ * triggering stack protector errors themselves */ -__attribute__((weak,noreturn,section(".text.nolibc_stack_chk"))) +__attribute__((weak,used,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail(void) { pid_t pid; @@ -34,7 +34,7 @@ void __stack_chk_fail_local(void) __stack_chk_fail(); } -__attribute__((weak,section(".data.nolibc_stack_chk"))) +__attribute__((weak,used,section(".data.nolibc_stack_chk"))) uintptr_t __stack_chk_guard; static __no_stack_protector void __stack_chk_init(void) -- cgit v1.2.3 From 25fb329a23c78d59a055a7b1329d18f30a2be92d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 12 Aug 2024 22:50:19 +0200 Subject: tools/nolibc: x86_64: use local label in memcpy/memmove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling arch-x86_64.h with clang and binutils LD yields duplicate label errors: .../gcc-13.2.0-nolibc/x86_64-linux/bin/x86_64-linux-ld: error: LLVM gold plugin: :44:1: symbol '.Lbackward_copy' is already defined .Lbackward_copy:leaq -1(%rdi, %rcx, 1), %rdi Instead of a local symbol use a local label which can be defined multiple times and therefore avoids the error. 
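Numeric local labels may be repeated; each reference binds to the
nearest definition in the given direction, so independent asm blobs no
longer clash (illustrative snippets, not the memcpy/memmove code):

    __asm__ ("jmp 1f\n\t"   /* "1f": nearest following "1:" */
             "nop\n"
             "1:\n\t"
             "ret\n");

    __asm__ ("jmp 1f\n\t"   /* a second "1:" definition is fine here */
             "nop\n"
             "1:\n\t"
             "ret\n");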
Reviewed-by: Ammar Faizi Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240812-nolibc-lto-v2-3-736af7bbefa8@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-x86_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index 65252c005a30..1e40620a2b33 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -193,10 +193,10 @@ __asm__ ( "movq %rdi, %rdx\n\t" "subq %rsi, %rdx\n\t" "cmpq %rcx, %rdx\n\t" - "jb .Lbackward_copy\n\t" + "jb 1f\n\t" "rep movsb\n\t" "retq\n" -".Lbackward_copy:" +"1:" /* backward copy */ "leaq -1(%rdi, %rcx, 1), %rdi\n\t" "leaq -1(%rsi, %rcx, 1), %rsi\n\t" "std\n\t" -- cgit v1.2.3 From 30dcdd6a3a6caa27d75196bcc4f6ea9c7a51ad19 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:01 +0300 Subject: selftests: fib_rule_tests: Remove unused functions The functions are unused since commit 816cda9ae531 ("selftests: net: fib_rule_tests: add support to select a test to run"). Remove them. Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 7c01f58a20de..c821d91b9ee8 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -56,14 +56,6 @@ log_test() fi } -log_section() -{ - echo - echo "######################################################################" - echo "TEST SECTION: $*" - echo "######################################################################" -} - check_nettest() { if which nettest > /dev/null 2>&1; then @@ -453,14 +445,6 @@ fib_rule4_connect_test() $IP -4 rule del dsfield 0x04 table $RTABLE_PEER cleanup_peer } - -run_fibrule_tests() -{ - log_section "IPv4 fib rule" - fib_rule4_test - log_section "IPv6 fib rule" - fib_rule6_test -} ################################################################################ # usage -- cgit v1.2.3 From b1487d6abeb5ccfc23f61d600ca63d2b09964f9a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:02 +0300 Subject: selftests: fib_rule_tests: Clarify test results Clarify the test results by grouping the output of test cases belonging to the same test under a common title. This is consistent with the output of fib_tests.sh. Before: # ./fib_rule_tests.sh TEST: rule6 check: oif redirect to table [ OK ] TEST: rule6 del by pref: oif redirect to table [ OK ] [...] TEST: rule4 check: oif redirect to table [ OK ] TEST: rule4 del by pref: oif redirect to table [ OK ] [...] Tests passed: 116 Tests failed: 0 After: # ./fib_rule_tests.sh IPv6 FIB rule tests TEST: rule6 check: oif redirect to table [ OK ] TEST: rule6 del by pref: oif redirect to table [ OK ] [...] IPv4 FIB rule tests TEST: rule4 check: oif redirect to table [ OK ] TEST: rule4 del by pref: oif redirect to table [ OK ] [...] 
Tests passed: 116 Tests failed: 0 Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index c821d91b9ee8..9100dd4d0382 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -35,18 +35,13 @@ log_test() local expected=$2 local msg="$3" - $IP rule show | grep -q l3mdev - if [ $? -eq 0 ]; then - msg="$msg (VRF)" - fi - if [ ${rc} -eq ${expected} ]; then nsuccess=$((nsuccess+1)) - printf "\n TEST: %-60s [ OK ]\n" "${msg}" + printf " TEST: %-60s [ OK ]\n" "${msg}" else ret=1 nfail=$((nfail+1)) - printf "\n TEST: %-60s [FAIL]\n" "${msg}" + printf " TEST: %-60s [FAIL]\n" "${msg}" if [ "${PAUSE_ON_FAIL}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" @@ -205,10 +200,14 @@ fib_rule6_test_reject() fib_rule6_test() { + local ext_name=$1; shift local getmatch local match local cnt + echo + echo "IPv6 FIB rule tests $ext_name" + # setup the fib rule redirect route $IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink @@ -267,7 +266,7 @@ fib_rule6_test() fib_rule6_vrf_test() { setup_vrf - fib_rule6_test + fib_rule6_test "- with VRF" cleanup_vrf } @@ -277,6 +276,9 @@ fib_rule6_connect_test() { local dsfield + echo + echo "IPv6 FIB rule connect tests" + if ! check_nettest; then echo "SKIP: Could not run test without nettest tool" return @@ -344,10 +346,14 @@ fib_rule4_test_reject() fib_rule4_test() { + local ext_name=$1; shift local getmatch local match local cnt + echo + echo "IPv4 FIB rule tests $ext_name" + # setup the fib rule redirect route $IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink @@ -411,7 +417,7 @@ fib_rule4_test() fib_rule4_vrf_test() { setup_vrf - fib_rule4_test + fib_rule4_test "- with VRF" cleanup_vrf } @@ -421,6 +427,9 @@ fib_rule4_connect_test() { local dsfield + echo + echo "IPv4 FIB rule connect tests" + if ! check_nettest; then echo "SKIP: Could not run test without nettest tool" return -- cgit v1.2.3 From 9b6dcef32c2d8fe357592da5f6f52ef2edf1652d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:03 +0300 Subject: selftests: fib_rule_tests: Add negative match tests The fib_rule{4,6} tests verify the behavior of a given FIB rule selector (e.g., dport, sport) by redirecting to a routing table with a default route using a FIB rule with the given selector and checking that a route lookup using the selector matches this default route. Add negative tests to verify that a FIB rule is not hit when it should not. 
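Each selector check now pairs the positive lookup with one that must
miss, along these lines (a sketch of the pattern in the script's own
terms, not a literal excerpt):

    $IP rule add sport 666 dport 777 table $RTABLE
    # positive: resolves through the redirected table
    $IP route get $GW_IP4 sport 666 dport 777 | grep -q "table $RTABLE"
    # negative: a non-matching selector must not hit the rule,
    # so this second grep is expected to fail
    $IP route get $GW_IP4 sport 667 dport 778 | grep -q "table $RTABLE"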
Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 87 +++++++++++++++++++++------ 1 file changed, 69 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 9100dd4d0382..085e21ed9fc3 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -174,12 +174,17 @@ fib_rule6_test_match_n_redirect() { local match="$1" local getmatch="$2" - local description="$3" + local getnomatch="$3" + local description="$4" + local nomatch_description="$5" $IP -6 rule add $match table $RTABLE $IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE" log_test $? 0 "rule6 check: $description" + $IP -6 route get $GW_IP6 $getnomatch 2>&1 | grep -q "table $RTABLE" + log_test $? 1 "rule6 check: $nomatch_description" + fib_rule6_del_by_pref "$match" log_test $? 0 "rule6 del by pref: $description" } @@ -201,6 +206,7 @@ fib_rule6_test_reject() fib_rule6_test() { local ext_name=$1; shift + local getnomatch local getmatch local match local cnt @@ -212,10 +218,14 @@ fib_rule6_test() $IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink match="oif $DEV" - fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table" + getnomatch="oif lo" + fib_rule6_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "oif redirect to table" "oif no redirect to table" match="from $SRC_IP6 iif $DEV" - fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table" + getnomatch="from $SRC_IP6 iif lo" + fib_rule6_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "iif redirect to table" "iif no redirect to table" # Reject dsfield (tos) options which have ECN bits set for cnt in $(seq 1 3); do @@ -229,37 +239,52 @@ fib_rule6_test() # Using option 'tos' instead of 'dsfield' as old iproute2 # versions don't support 'dsfield' in ip rule show. getmatch="tos $cnt" + getnomatch="tos 0x20" fib_rule6_test_match_n_redirect "$match" "$getmatch" \ - "$getmatch redirect to table" + "$getnomatch" "$getmatch redirect to table" \ + "$getnomatch no redirect to table" done match="fwmark 0x64" getmatch="mark 0x64" - fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table" + getnomatch="mark 0x63" + fib_rule6_test_match_n_redirect "$match" "$getmatch" "$getnomatch" \ + "fwmark redirect to table" "fwmark no redirect to table" fib_check_iproute_support "uidrange" "uid" if [ $? -eq 0 ]; then match="uidrange 100-100" getmatch="uid 100" - fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table" + getnomatch="uid 101" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "uid redirect to table" \ + "uid no redirect to table" fi fib_check_iproute_support "sport" "sport" if [ $? -eq 0 ]; then match="sport 666 dport 777" - fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table" + getnomatch="sport 667 dport 778" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "sport and dport redirect to table" \ + "sport and dport no redirect to table" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? 
-eq 0 ]; then match="ipproto tcp" - fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match" + getnomatch="ipproto udp" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto tcp match" "ipproto udp no match" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto ipv6-icmp" - fib_rule6_test_match_n_redirect "$match" "$match" "ipproto ipv6-icmp match" + getnomatch="ipproto tcp" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto ipv6-icmp match" \ + "ipproto ipv6-tcp no match" fi } @@ -320,12 +345,17 @@ fib_rule4_test_match_n_redirect() { local match="$1" local getmatch="$2" - local description="$3" + local getnomatch="$3" + local description="$4" + local nomatch_description="$5" $IP rule add $match table $RTABLE $IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE" log_test $? 0 "rule4 check: $description" + $IP route get $GW_IP4 $getnomatch 2>&1 | grep -q "table $RTABLE" + log_test $? 1 "rule4 check: $nomatch_description" + fib_rule4_del_by_pref "$match" log_test $? 0 "rule4 del by pref: $description" } @@ -347,6 +377,7 @@ fib_rule4_test_reject() fib_rule4_test() { local ext_name=$1; shift + local getnomatch local getmatch local match local cnt @@ -358,14 +389,18 @@ fib_rule4_test() $IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink match="oif $DEV" - fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table" + getnomatch="oif lo" + fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "oif redirect to table" "oif no redirect to table" # need enable forwarding and disable rp_filter temporarily as all the # addresses are in the same subnet and egress device == ingress device. ip netns exec $testns sysctl -qw net.ipv4.ip_forward=1 ip netns exec $testns sysctl -qw net.ipv4.conf.$DEV.rp_filter=0 match="from $SRC_IP iif $DEV" - fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table" + getnomatch="from $SRC_IP iif lo" + fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "iif redirect to table" "iif no redirect to table" ip netns exec $testns sysctl -qw net.ipv4.ip_forward=0 # Reject dsfield (tos) options which have ECN bits set @@ -380,37 +415,53 @@ fib_rule4_test() # Using option 'tos' instead of 'dsfield' as old iproute2 # versions don't support 'dsfield' in ip rule show. getmatch="tos $cnt" + getnomatch="tos 0x20" fib_rule4_test_match_n_redirect "$match" "$getmatch" \ - "$getmatch redirect to table" + "$getnomatch" "$getmatch redirect to table" \ + "$getnomatch no redirect to table" done match="fwmark 0x64" getmatch="mark 0x64" - fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table" + getnomatch="mark 0x63" + fib_rule4_test_match_n_redirect "$match" "$getmatch" "$getnomatch" \ + "fwmark redirect to table" "fwmark no redirect to table" fib_check_iproute_support "uidrange" "uid" if [ $? -eq 0 ]; then match="uidrange 100-100" getmatch="uid 100" - fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table" + getnomatch="uid 101" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "uid redirect to table" \ + "uid no redirect to table" fi fib_check_iproute_support "sport" "sport" if [ $? 
-eq 0 ]; then match="sport 666 dport 777" - fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table" + getnomatch="sport 667 dport 778" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "sport and dport redirect to table" \ + "sport and dport no redirect to table" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto tcp" - fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match" + getnomatch="ipproto udp" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto tcp match" \ + "ipproto udp no match" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto icmp" - fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match" + getnomatch="ipproto tcp" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto icmp match" \ + "ipproto tcp no match" fi } -- cgit v1.2.3 From 53f88ed85bdd194b8a30605a264692b28a3ee665 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:04 +0300 Subject: selftests: fib_rule_tests: Add negative connect tests The fib_rule{4,6}_connect tests verify that locally generated traffic from a socket that specifies a DS Field using the IP_TOS / IPV6_TCLASS socket options is correctly redirected using a FIB rule that matches on the given DS Field. Add negative tests to verify that the FIB rule is not hit when the socket specifies a DS Field that differs from the one used by the FIB rule. Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-5-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 085e21ed9fc3..a3b2c833f050 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -325,6 +325,16 @@ fib_rule6_connect_test() log_test $? 0 "rule6 dsfield tcp connect (dsfield ${dsfield})" done + # Check that UDP and TCP connections fail when using a DS Field that + # does not match the previously configured FIB rule. + nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D \ + -Q 0x20 -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 1 "rule6 dsfield udp no connect (dsfield 0x20)" + + nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0x20 \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 1 "rule6 dsfield tcp no connect (dsfield 0x20)" + $IP -6 rule del dsfield 0x04 table $RTABLE_PEER cleanup_peer } @@ -502,6 +512,16 @@ fib_rule4_connect_test() log_test $? 0 "rule4 dsfield tcp connect (dsfield ${dsfield})" done + # Check that UDP and TCP connections fail when using a DS Field that + # does not match the previously configured FIB rule. + nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0x20 \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 1 "rule4 dsfield udp no connect (dsfield 0x20)" + + nettest -q -B -t 5 -N $testns -O $peerns -Q 0x20 \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 
1 "rule4 dsfield tcp no connect (dsfield 0x20)" + $IP -4 rule del dsfield 0x04 table $RTABLE_PEER cleanup_peer } -- cgit v1.2.3 From 5f1b4f1be2d2b8d85dd07352b097cfc7064ad2e5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:05 +0300 Subject: selftests: fib_rule_tests: Test TOS matching with input routes The TOS value reaches the FIB rule core via different call paths when an input route is looked up compared to an output route. Re-test TOS matching with input routes to exercise these code paths. Pass the 'iif' and 'from' selectors separately from the 'get{,no}match' variables as otherwise the test name is too long to be printed without misalignments. Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-6-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 31 ++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index a3b2c833f050..89034c5b69dc 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -245,6 +245,19 @@ fib_rule6_test() "$getnomatch no redirect to table" done + # Re-test TOS matching, but with input routes since they are handled + # differently from output routes. + match="tos 0x10" + for cnt in "0x10" "0x11" "0x12" "0x13"; do + getmatch="tos $cnt" + getnomatch="tos 0x20" + fib_rule6_test_match_n_redirect "$match" \ + "from $SRC_IP6 iif $DEV $getmatch" \ + "from $SRC_IP6 iif $DEV $getnomatch" \ + "iif $getmatch redirect to table" \ + "iif $getnomatch no redirect to table" + done + match="fwmark 0x64" getmatch="mark 0x64" getnomatch="mark 0x63" @@ -403,15 +416,14 @@ fib_rule4_test() fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ "oif redirect to table" "oif no redirect to table" - # need enable forwarding and disable rp_filter temporarily as all the - # addresses are in the same subnet and egress device == ingress device. + # Enable forwarding and disable rp_filter as all the addresses are in + # the same subnet and egress device == ingress device. ip netns exec $testns sysctl -qw net.ipv4.ip_forward=1 ip netns exec $testns sysctl -qw net.ipv4.conf.$DEV.rp_filter=0 match="from $SRC_IP iif $DEV" getnomatch="from $SRC_IP iif lo" fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ "iif redirect to table" "iif no redirect to table" - ip netns exec $testns sysctl -qw net.ipv4.ip_forward=0 # Reject dsfield (tos) options which have ECN bits set for cnt in $(seq 1 3); do @@ -431,6 +443,19 @@ fib_rule4_test() "$getnomatch no redirect to table" done + # Re-test TOS matching, but with input routes since they are handled + # differently from output routes. 
+ match="tos 0x10" + for cnt in "0x10" "0x11" "0x12" "0x13"; do + getmatch="tos $cnt" + getnomatch="tos 0x20" + fib_rule4_test_match_n_redirect "$match" \ + "from $SRC_IP iif $DEV $getmatch" \ + "from $SRC_IP iif $DEV $getnomatch" \ + "iif $getmatch redirect to table" \ + "iif $getnomatch no redirect to table" + done + match="fwmark 0x64" getmatch="mark 0x64" getnomatch="mark 0x63" -- cgit v1.2.3 From 6bdf5168b6fb19541b0c1862bdaa596d116c7bfb Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Tue, 6 Aug 2024 10:35:33 +0800 Subject: perf sched timehist: Fix missing free of session in perf_sched__timehist() When perf_time__parse_str() fails in perf_sched__timehist(), need to free session that was previously created, fix it. Fixes: 853b74071110bed3 ("perf sched timehist: Add option to specify time window of interest") Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: David Ahern Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240806023533.1316348-1-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 34fe8e540c43..9c1276dc8ef6 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3120,7 +3120,8 @@ static int perf_sched__timehist(struct perf_sched *sched) if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) { pr_err("Invalid time string\n"); - return -EINVAL; + err = -EINVAL; + goto out; } if (timehist_check_attr(sched, evlist) != 0) -- cgit v1.2.3 From 2615639352420e6e3115952c5b8f46846e1c6d0e Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 2 Aug 2024 14:58:00 +0800 Subject: perf stat: Display iostat headers correctly Currently we'll only print metric headers for metric leader in aggregration mode. This will make `perf iostat` header not shown since it'll aggregrated globally but don't have metric events: root@ubuntu204:/home/yang/linux/tools/perf# ./perf stat --iostat --timeout 1000 Performance counter stats for 'system wide': port 0000:00 0 0 0 0 0000:80 0 0 0 0 [...] Fix this by excluding the iostat in the check of printing metric headers. Then we can see the headers: root@ubuntu204:/home/yang/linux/tools/perf# ./perf stat --iostat --timeout 1000 Performance counter stats for 'system wide': port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) 0000:00 0 0 0 0 0000:80 0 0 0 0 [...] 
Fixes: 193a9e30207f5477 ("perf stat: Don't display metric header for non-leader uncore events") Signed-off-by: Yicong Yang Acked-by: Namhyung Kim Cc: Ian Rogers Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Junhao He Cc: linuxarm@huawei.com Cc: Mark Rutland Cc: Peter Zijlstra Cc: Shameerali Kolothum Thodi Cc: Zeng Tao Link: https://lore.kernel.org/r/20240802065800.48774-1-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index c38bcb6f4c78..ea96e4ebad8c 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -1237,7 +1237,8 @@ static void print_metric_headers(struct perf_stat_config *config, /* Print metrics headers only */ evlist__for_each_entry(evlist, counter) { - if (config->aggr_mode != AGGR_NONE && counter->metric_leader != counter) + if (!config->iostat_run && + config->aggr_mode != AGGR_NONE && counter->metric_leader != counter) continue; os.evsel = counter; -- cgit v1.2.3 From 1026392d10af7b35361c7ec4e14569d88c0c33e4 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 13 Aug 2024 13:07:01 +0100 Subject: selftests: ALSA: Cover userspace-driven timers with test Add a test for the new functionality of userspace-driven timers and the tool which allows us to count timer ticks in a certain time period. The test: 1. Creates a userspace-driven timer with ioctl to /dev/snd/timer 2. Starts the `global-timer` application to count the ticks of the timer from step 1. 3. Asynchronously triggers the timer multiple times with some interval 4. Compares the amount of caught ticks with the amount of trigger calls. Since we can't include and in one file due to overlapping declarations, I have to split the test into two applications: one of them counts the amount of timer ticks in the defined time period, and another one is the actual test which creates the timer, triggers it periodically and starts the first app to count the amount of ticks in a separate thread. Besides from testing the functionality itself, the test represents a sample application showing userspace-driven ALSA timers API. Also, the timer test includes a test case which tries to create a timer with invalid resolution (=0), and NULL as a timer info structure. 
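The kernel-facing half of the API, condensed from the test below
(error handling omitted for brevity):

    struct snd_timer_uinfo info = {
        .resolution = 1000000,  /* tick period in ns (example value) */
    };
    int tfd = open("/dev/snd/timer", O_RDONLY);

    /* Create the virtual timer; the kernel fills in info.id (used by
     * the counting helper to bind via the global timer class) and
     * returns a dedicated fd in info.fd.
     */
    ioctl(tfd, SNDRV_TIMER_IOCTL_CREATE, &info);

    /* Each trigger from userspace produces one tick for listeners. */
    ioctl(info.fd, SNDRV_TIMER_IOCTL_TRIGGER, NULL);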
Signed-off-by: Ivan Orlov Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20240813120701.171743-5-ivan.orlov0322@gmail.com --- tools/testing/selftests/alsa/Makefile | 4 +- tools/testing/selftests/alsa/global-timer.c | 87 +++++++++++++++ tools/testing/selftests/alsa/utimer-test.c | 164 ++++++++++++++++++++++++++++ 3 files changed, 253 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/alsa/global-timer.c create mode 100644 tools/testing/selftests/alsa/utimer-test.c (limited to 'tools') diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile index c1ce39874e2b..25be68025290 100644 --- a/tools/testing/selftests/alsa/Makefile +++ b/tools/testing/selftests/alsa/Makefile @@ -12,9 +12,9 @@ LDLIBS+=-lpthread OVERRIDE_TARGETS = 1 -TEST_GEN_PROGS := mixer-test pcm-test test-pcmtest-driver +TEST_GEN_PROGS := mixer-test pcm-test test-pcmtest-driver utimer-test -TEST_GEN_PROGS_EXTENDED := libatest.so +TEST_GEN_PROGS_EXTENDED := libatest.so global-timer TEST_FILES := conf.d pcm-test.conf diff --git a/tools/testing/selftests/alsa/global-timer.c b/tools/testing/selftests/alsa/global-timer.c new file mode 100644 index 000000000000..c15ec0ba851a --- /dev/null +++ b/tools/testing/selftests/alsa/global-timer.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This tool is used by the utimer test, and it allows us to + * count the ticks of a global timer in a certain time frame + * (which is set by `timeout` parameter). + * + * Author: Ivan Orlov + */ +#include +#include +#include +#include + +static int ticked; +static void async_callback(snd_async_handler_t *ahandler) +{ + ticked++; +} + +static char timer_name[64]; +static void bind_to_timer(int device, int subdevice, int timeout) +{ + snd_timer_t *handle; + snd_timer_params_t *params; + snd_async_handler_t *ahandler; + + time_t end; + + sprintf(timer_name, "hw:CLASS=%d,SCLASS=%d,DEV=%d,SUBDEV=%d", + SND_TIMER_CLASS_GLOBAL, SND_TIMER_SCLASS_NONE, + device, subdevice); + + snd_timer_params_alloca(¶ms); + + if (snd_timer_open(&handle, timer_name, SND_TIMER_OPEN_NONBLOCK) < 0) { + perror("Can't open the timer"); + exit(EXIT_FAILURE); + } + + snd_timer_params_set_auto_start(params, 1); + snd_timer_params_set_ticks(params, 1); + if (snd_timer_params(handle, params) < 0) { + perror("Can't set timer params"); + exit(EXIT_FAILURE); + } + + if (snd_async_add_timer_handler(&ahandler, handle, async_callback, NULL) < 0) { + perror("Can't create a handler"); + exit(EXIT_FAILURE); + } + end = time(NULL) + timeout; + if (snd_timer_start(handle) < 0) { + perror("Failed to start the timer"); + exit(EXIT_FAILURE); + } + printf("Timer has started\n"); + while (time(NULL) <= end) { + /* + * Waiting for the timeout to elapse. 
Can't use sleep here, as it gets + * constantly interrupted by the signal from the timer (SIGIO) + */ + } + snd_timer_stop(handle); + snd_timer_close(handle); +} + +int main(int argc, char *argv[]) +{ + int device, subdevice, timeout; + + if (argc < 4) { + perror("Usage: %s "); + return EXIT_FAILURE; + } + + setlinebuf(stdout); + + device = atoi(argv[1]); + subdevice = atoi(argv[2]); + timeout = atoi(argv[3]); + + bind_to_timer(device, subdevice, timeout); + + printf("Total ticks count: %d\n", ticked); + + return EXIT_SUCCESS; +} diff --git a/tools/testing/selftests/alsa/utimer-test.c b/tools/testing/selftests/alsa/utimer-test.c new file mode 100644 index 000000000000..32ee3ce57721 --- /dev/null +++ b/tools/testing/selftests/alsa/utimer-test.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This test covers the functionality of userspace-driven ALSA timers. Such timers + * are purely virtual (so they don't directly depend on the hardware), and they could be + * created and triggered by userspace applications. + * + * Author: Ivan Orlov + */ +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#define FRAME_RATE 8000 +#define PERIOD_SIZE 4410 +#define UTIMER_DEFAULT_ID -1 +#define UTIMER_DEFAULT_FD -1 +#define NANO 1000000000ULL +#define TICKS_COUNT 10 +#define TICKS_RECORDING_DELTA 5 +#define TIMER_OUTPUT_BUF_LEN 1024 +#define TIMER_FREQ_SEC 1 +#define RESULT_PREFIX_LEN strlen("Total ticks count: ") + +enum timer_app_event { + TIMER_APP_STARTED, + TIMER_APP_RESULT, + TIMER_NO_EVENT, +}; + +FIXTURE(timer_f) { + struct snd_timer_uinfo *utimer_info; +}; + +FIXTURE_SETUP(timer_f) { + int timer_dev_fd; + + if (geteuid()) + SKIP(return, "This test needs root to run!"); + + self->utimer_info = calloc(1, sizeof(*self->utimer_info)); + ASSERT_NE(NULL, self->utimer_info); + + /* Resolution is the time the period of frames takes in nanoseconds */ + self->utimer_info->resolution = (NANO / FRAME_RATE * PERIOD_SIZE); + + timer_dev_fd = open("/dev/snd/timer", O_RDONLY); + ASSERT_GE(timer_dev_fd, 0); + + ASSERT_EQ(ioctl(timer_dev_fd, SNDRV_TIMER_IOCTL_CREATE, self->utimer_info), 0); + ASSERT_GE(self->utimer_info->fd, 0); + + close(timer_dev_fd); +} + +FIXTURE_TEARDOWN(timer_f) { + close(self->utimer_info->fd); + free(self->utimer_info); +} + +static void *ticking_func(void *data) +{ + int i; + int *fd = (int *)data; + + for (i = 0; i < TICKS_COUNT; i++) { + /* Well, trigger the timer! */ + ioctl(*fd, SNDRV_TIMER_IOCTL_TRIGGER, NULL); + sleep(TIMER_FREQ_SEC); + } + + return NULL; +} + +static enum timer_app_event parse_timer_output(const char *s) +{ + if (strstr(s, "Timer has started")) + return TIMER_APP_STARTED; + if (strstr(s, "Total ticks count")) + return TIMER_APP_RESULT; + + return TIMER_NO_EVENT; +} + +static int parse_timer_result(const char *s) +{ + char *end; + long d; + + d = strtol(s + RESULT_PREFIX_LEN, &end, 10); + if (end == s + RESULT_PREFIX_LEN) + return -1; + + return d; +} + +/* + * This test triggers the timer and counts ticks at the same time. The amount + * of the timer trigger calls should be equal to the amount of ticks received. 
+ */ +TEST_F(timer_f, utimer) { + char command[64]; + pthread_t ticking_thread; + int total_ticks = 0; + FILE *rfp; + char *buf = malloc(TIMER_OUTPUT_BUF_LEN); + + ASSERT_NE(buf, NULL); + + /* The timeout should be the ticks interval * count of ticks + some delta */ + sprintf(command, "./global-timer %d %d %d", SNDRV_TIMER_GLOBAL_UDRIVEN, + self->utimer_info->id, TICKS_COUNT * TIMER_FREQ_SEC + TICKS_RECORDING_DELTA); + + rfp = popen(command, "r"); + while (fgets(buf, TIMER_OUTPUT_BUF_LEN, rfp)) { + buf[TIMER_OUTPUT_BUF_LEN - 1] = 0; + switch (parse_timer_output(buf)) { + case TIMER_APP_STARTED: + /* global-timer waits for timer to trigger, so start the ticking thread */ + pthread_create(&ticking_thread, NULL, ticking_func, + &self->utimer_info->fd); + break; + case TIMER_APP_RESULT: + total_ticks = parse_timer_result(buf); + break; + case TIMER_NO_EVENT: + break; + } + } + pthread_join(ticking_thread, NULL); + ASSERT_EQ(total_ticks, TICKS_COUNT); + pclose(rfp); +} + +TEST(wrong_timers_test) { + int timer_dev_fd; + int utimer_fd; + size_t i; + struct snd_timer_uinfo wrong_timer = { + .resolution = 0, + .id = UTIMER_DEFAULT_ID, + .fd = UTIMER_DEFAULT_FD, + }; + + timer_dev_fd = open("/dev/snd/timer", O_RDONLY); + ASSERT_GE(timer_dev_fd, 0); + + utimer_fd = ioctl(timer_dev_fd, SNDRV_TIMER_IOCTL_CREATE, &wrong_timer); + ASSERT_LT(utimer_fd, 0); + /* Check that id was not updated */ + ASSERT_EQ(wrong_timer.id, UTIMER_DEFAULT_ID); + + /* Test the NULL as an argument is processed correctly */ + ASSERT_LT(ioctl(timer_dev_fd, SNDRV_TIMER_IOCTL_CREATE, NULL), 0); + + close(timer_dev_fd); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From 56d680dd23c38067a32fb8aeb74d6ce838fcf26c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 25 Jul 2024 20:33:22 +0200 Subject: objtool/rust: list `noreturn` Rust functions Rust functions may be `noreturn` (i.e. diverging) by returning the "never" type, `!`, e.g. fn f() -> ! { loop {} } Thus list the known `noreturn` functions to avoid such warnings. Without this, `objtool` would complain if enabled for Rust, e.g.: rust/core.o: warning: objtool: _R...9panic_fmt() falls through to next function _R...18panic_nounwind_fmt() rust/alloc.o: warning: objtool: .text: unexpected end of section In order to do so, we cannot match symbols' names exactly, for two reasons: - Rust mangling scheme [1] contains disambiguators [2] which we cannot predict (e.g. they may vary depending on the compiler version). One possibility to solve this would be to parse v0 and ignore/zero those before comparison. - Some of the diverging functions come from `core`, i.e. the Rust standard library, which may change with each compiler version since they are implementation details (e.g. `panic_internals`). Thus, to workaround both issues, only part of the symbols are matched, instead of using the `NORETURN` macro in `noreturns.h`. Ideally, just like for the C side, we should have a better solution. For instance, the compiler could give us the list via something like: $ rustc --emit=noreturns ... [ Kees agrees this should be automated and Peter says: So it would be fairly simple to make objtool consume a magic section emitted by the compiler.. I think we've asked the compiler folks for that at some point even, but I don't have clear recollections. We will ask upstream Rust about it. And if they agree, then perhaps we can get Clang/GCC to implement something similar too -- for this sort of thing we can take advantage of the shorter cycles of `rustc` as well as their unstable features concept to experiment. 
Gary proposed using DWARF (though it would need to be available), and wrote a proof of concept script using the `object` and `gimli` crates: https://gist.github.com/nbdd0121/449692570622c2f46a29ad9f47c3379a - Miguel ] Link: https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html [1] Link: https://doc.rust-lang.org/rustc/symbol-mangling/v0.html#disambiguator [2] Acked-by: Peter Zijlstra (Intel) Tested-by: Alice Ryhl Reviewed-by: Kees Cook Tested-by: Benno Lossin Link: https://lore.kernel.org/r/20240725183325.122827-6-ojeda@kernel.org [ Added `len_mismatch_fail` symbol for new `kernel` crate code merged since then as well as 3 more `core::panicking` symbols that appear in `RUST_DEBUG_ASSERTIONS=y` builds. - Miguel ] Signed-off-by: Miguel Ojeda --- tools/objtool/check.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++- tools/objtool/noreturns.h | 2 ++ 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 01237d167223..d086f207a3d3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -177,6 +177,52 @@ static bool is_sibling_call(struct instruction *insn) return (is_static_jump(insn) && insn_call_dest(insn)); } +/* + * Checks if a string ends with another. + */ +static bool str_ends_with(const char *s, const char *sub) +{ + const int slen = strlen(s); + const int sublen = strlen(sub); + + if (sublen > slen) + return 0; + + return !memcmp(s + slen - sublen, sub, sublen); +} + +/* + * Checks if a function is a Rust "noreturn" one. + */ +static bool is_rust_noreturn(const struct symbol *func) +{ + /* + * If it does not start with "_R", then it is not a Rust symbol. + */ + if (strncmp(func->name, "_R", 2)) + return false; + + /* + * These are just heuristics -- we do not control the precise symbol + * name, due to the crate disambiguators (which depend on the compiler) + * as well as changes to the source code itself between versions (since + * these come from the Rust standard library). + */ + return str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail") || + str_ends_with(func->name, "_4core6option13unwrap_failed") || + str_ends_with(func->name, "_4core6result13unwrap_failed") || + str_ends_with(func->name, "_4core9panicking5panic") || + str_ends_with(func->name, "_4core9panicking9panic_fmt") || + str_ends_with(func->name, "_4core9panicking14panic_explicit") || + str_ends_with(func->name, "_4core9panicking14panic_nounwind") || + str_ends_with(func->name, "_4core9panicking18panic_bounds_check") || + str_ends_with(func->name, "_4core9panicking19assert_failed_inner") || + str_ends_with(func->name, "_4core9panicking36panic_misaligned_pointer_dereference") || + strstr(func->name, "_4core9panicking11panic_const24panic_const_") || + (strstr(func->name, "_4core5slice5index24slice_") && + str_ends_with(func->name, "_fail")); +} + /* * This checks to see if the given function is a "noreturn" function. 
* @@ -202,10 +248,14 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, if (!func) return false; - if (func->bind == STB_GLOBAL || func->bind == STB_WEAK) + if (func->bind == STB_GLOBAL || func->bind == STB_WEAK) { + if (is_rust_noreturn(func)) + return true; + for (i = 0; i < ARRAY_SIZE(global_noreturns); i++) if (!strcmp(func->name, global_noreturns[i])) return true; + } if (func->bind == STB_WEAK) return false; diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 1e8141ef1b15..e7da92489167 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -39,6 +39,8 @@ NORETURN(panic) NORETURN(panic_smp_self_stop) NORETURN(rest_init) NORETURN(rewind_stack_and_make_dead) +NORETURN(rust_begin_unwind) +NORETURN(rust_helper_BUG) NORETURN(sev_es_terminate) NORETURN(snp_abort) NORETURN(start_kernel) -- cgit v1.2.3 From d0fe8920cbe42547798fd806f078eeaaba05df18 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Jul 2024 15:15:36 +0200 Subject: selftests: add F_CREATED_QUERY tests Add simple selftests for fcntl(fd, F_CREATED_QUERY, 0). Link: https://lore.kernel.org/r/20240724-work-fcntl-v1-2-e8153a2f1991@kernel.org Signed-off-by: Christian Brauner --- tools/testing/selftests/core/close_range_test.c | 39 +++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 12b4eb9d0434..e0d9851fe1c9 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -26,6 +26,10 @@ #define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) #endif +#ifndef F_CREATED_QUERY +#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4) +#endif + static inline int sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) { @@ -624,4 +628,39 @@ TEST(close_range_bitmap_corruption) EXPECT_EQ(0, WEXITSTATUS(status)); } +TEST(fcntl_created) +{ + for (int i = 0; i < 101; i++) { + int fd; + char path[PATH_MAX]; + + fd = open("/dev/null", O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0) { + if (errno == ENOENT) + SKIP(return, + "Skipping test since /dev/null does not exist"); + } + + /* We didn't create "/dev/null". */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0); + close(fd); + + sprintf(path, "aaaa_%d", i); + fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0600); + ASSERT_GE(fd, 0); + + /* We created "aaaa_%d". */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 1); + close(fd); + + fd = open(path, O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0); + + /* We're opening it again, so no positive creation check. */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0); + close(fd); + unlink(path); + } +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From e8bb03ed6850c6ed4ce2f1600ea73401fc2ebd95 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:31 -0700 Subject: perf dwarf-aux: Check allowed location expressions when collecting variables It missed to call check_allowed_ops() in __die_collect_vars_cb() so it can take variables with complex location expression incorrectly. For example, I found some variable has this expression. 
015d8df8 ffffffff81aacfb3 (base address) 015d8e01 v000000000000004 v000000000000000 views at 015d8df2 for: ffffffff81aacfb3 ffffffff81aacfd2 (DW_OP_fbreg: -176; DW_OP_deref; DW_OP_plus_uconst: 332; DW_OP_deref_size: 4; DW_OP_lit1; DW_OP_shra; DW_OP_const1u: 64; DW_OP_minus; DW_OP_stack_value) 015d8e14 v000000000000000 v000000000000000 views at 015d8df4 for: ffffffff81aacfd2 ffffffff81aacfd7 (DW_OP_reg3 (rbx)) 015d8e19 v000000000000000 v000000000000000 views at 015d8df6 for: ffffffff81aacfd7 ffffffff81aad020 (DW_OP_fbreg: -176; DW_OP_deref; DW_OP_plus_uconst: 332; DW_OP_deref_size: 4; DW_OP_lit1; DW_OP_shra; DW_OP_const1u: 64; DW_OP_minus; DW_OP_stack_value) 015d8e2c It looks like '((int *)(-176(%rbp) + 332) >> 1) - 64' but the current code thought it's just -176(%rbp) and processed the variable incorrectly. It should reject such a complex expression if check_allowed_ops() doesn't like it. :) Fixes: 932dcc2c39aedf54 ("perf dwarf-aux: Add die_collect_vars()") Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 5e080d7e22c2..beb632153a74 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1598,6 +1598,9 @@ static int __die_collect_vars_cb(Dwarf_Die *die_mem, void *arg) if (dwarf_getlocations(&attr, 0, &base, &start, &end, &ops, &nops) <= 0) return DIE_FIND_CB_SIBLING; + if (!check_allowed_ops(ops, nops)) + return DIE_FIND_CB_SIBLING; + if (die_get_real_type(die_mem, &type_die) == NULL) return DIE_FIND_CB_SIBLING; -- cgit v1.2.3 From 3ab0b8b238b5130ae3fa37ddaa329fc0e93b6b9a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:32 -0700 Subject: perf annotate-data: Fix off-by-one in location range check The location list will have entries with half-open addressing like [start, end) which means it doesn't include the end address. So it should skip entries at the end address and match to the next entry. An example location list looks like this (from readelf -wo): 00237876 ffffffff8110d32b (base address) 0023787f v000000000000000 v000000000000002 views at 00237868 for: ffffffff8110d32b ffffffff8110d4eb (DW_OP_reg3 (rbx)) <<<--- 1 00237885 v000000000000002 v000000000000000 views at 0023786a for: ffffffff8110d4eb ffffffff8110d50b (DW_OP_reg14 (r14)) <<<--- 2 0023788c v000000000000000 v000000000000001 views at 0023786c for: ffffffff8110d50b ffffffff8110d7c4 (DW_OP_reg3 (rbx)) 00237893 v000000000000000 v000000000000000 views at 0023786e for: ffffffff8110d806 ffffffff8110d854 (DW_OP_reg3 (rbx)) 0023789a v000000000000000 v000000000000000 views at 00237870 for: ffffffff8110d876 ffffffff8110d88e (DW_OP_reg3 (rbx)) The first entry at 0023787f has [8110d32b, 8110d4eb) (omitting the ffffffff at the beginning), and the second one has [8110d4eb, 8110d50b). 
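To make the off-by-one concrete: for pc == 0xffffffff8110d4eb the first entry must be skipped, since its half-open range ends exactly there, but the old comparison kept it (worked example against the dump above):

    /* entry [8110d32b, 8110d4eb), pc = 8110d4eb */
    if (end <  pc)  /* old: false, entry wrongly matched as DW_OP_reg3 (rbx) */
    if (end <= pc)  /* new: true, skip on to the DW_OP_reg14 (r14) entry    */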
Fixes: 2bc3cf575a162a2c ("perf annotate-data: Improve debug message with location info") Reviewed-by: Masami Hiramatsu Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 2 +- tools/perf/util/dwarf-aux.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index ff85d190e3ac..fd8d3cdead5a 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -95,7 +95,7 @@ static void pr_debug_location(Dwarf_Die *die, u64 pc, int reg) return; while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) { - if (reg != DWARF_REG_PC && end < pc) + if (reg != DWARF_REG_PC && end <= pc) continue; if (reg != DWARF_REG_PC && start > pc) break; diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index beb632153a74..0151a8d14350 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1444,7 +1444,7 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) { /* Assuming the location list is sorted by address */ - if (end < data->pc) + if (end <= data->pc) continue; if (start > data->pc) break; -- cgit v1.2.3 From 976862f8abefafc42a27fff4c0e5d56cfc8d8ef4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:33 -0700 Subject: perf annotate-data: Add 'enum type_match_result' And let check_variable() return the enum value so that callers can know what was the problem. This will be used by the later patch to update the statistics correctly and print the error message in a right place. 
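As a sketch of the resulting calling convention (all names taken from the patch below; the old code could only distinguish 0 from -1):

    enum type_match_result result;

    result = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg);
    if (result != PERF_TMR_OK)
            return result;  /* e.g. PERF_TMR_NO_POINTER vs. PERF_TMR_BAD_OFFSET */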
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index fd8d3cdead5a..8e3b422eca22 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -345,9 +345,20 @@ static bool find_cu_die(struct debuginfo *di, u64 pc, Dwarf_Die *cu_die) return false; } +enum type_match_result { + PERF_TMR_UNKNOWN = 0, + PERF_TMR_OK, + PERF_TMR_NO_TYPE, + PERF_TMR_NO_POINTER, + PERF_TMR_NO_SIZE, + PERF_TMR_BAD_OFFSET, +}; + /* The type info will be saved in @type_die */ -static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, - Dwarf_Die *type_die, int reg, int offset, bool is_fbreg) +static enum type_match_result check_variable(struct data_loc_info *dloc, + Dwarf_Die *var_die, + Dwarf_Die *type_die, int reg, + int offset, bool is_fbreg) { Dwarf_Word size; bool is_pointer = true; @@ -364,7 +375,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, if (__die_get_real_type(var_die, type_die) == NULL) { pr_debug_dtp("variable has no type\n"); ann_data_stat.no_typeinfo++; - return -1; + return PERF_TMR_NO_TYPE; } /* @@ -378,7 +389,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, __die_get_real_type(type_die, type_die) == NULL) { pr_debug_dtp("no pointer or no type\n"); ann_data_stat.no_typeinfo++; - return -1; + return PERF_TMR_NO_POINTER; } } @@ -391,7 +402,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, if (dwarf_aggregate_size(&sized_type, &size) < 0) { pr_debug_dtp("type size is unknown\n"); ann_data_stat.invalid_size++; - return -1; + return PERF_TMR_NO_SIZE; } /* Minimal sanity check */ @@ -399,10 +410,10 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, pr_debug_dtp("offset: %d is bigger than size: %"PRIu64"\n", offset, size); ann_data_stat.bad_offset++; - return -1; + return PERF_TMR_BAD_OFFSET; } - return 0; + return PERF_TMR_OK; } struct type_state_stack *find_stack_state(struct type_state *state, @@ -652,7 +663,7 @@ bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, /* Try to get the variable by address first */ if (die_find_variable_by_addr(cu_die, var_addr, &var_die, &offset) && check_variable(dloc, &var_die, type_die, DWARF_REG_PC, offset, - /*is_fbreg=*/false) == 0) { + /*is_fbreg=*/false) == PERF_TMR_OK) { var_name = dwarf_diename(&var_die); *var_offset = offset; goto ok; @@ -666,7 +677,7 @@ bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, /* Try to get the name of global variable */ if (die_find_variable_at(cu_die, var_name, pc, &var_die) && check_variable(dloc, &var_die, type_die, DWARF_REG_PC, *var_offset, - /*is_fbreg=*/false) == 0) + /*is_fbreg=*/false) == PERF_TMR_OK) goto ok; return false; @@ -1205,6 +1216,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) bool is_fbreg = false; u64 pc; char buf[64]; + enum type_match_result result; if (dloc->op->multi_regs) snprintf(buf, sizeof(buf), "reg%d, reg%d", dloc->op->reg1, dloc->op->reg2); @@ -1299,8 +1311,8 @@ retry: } /* Found a variable, see if it's correct */ - ret = 
check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); - if (ret == 0) { + result = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); + if (result == PERF_TMR_OK) { pr_debug_dtp("found \"%s\" in scope=%d/%d (die: %#lx) ", dwarf_diename(&var_die), i+1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); @@ -1315,12 +1327,14 @@ retry: } pr_debug_location(&var_die, pc, reg); pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = 0; } else { pr_debug_dtp("check variable \"%s\" failed (die: %#lx)\n", dwarf_diename(&var_die), (long)dwarf_dieoffset(&var_die)); pr_debug_location(&var_die, pc, reg); pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = -1; } dloc->type_offset = offset; goto out; -- cgit v1.2.3 From 653185d808ea6f5e3e6247087b793fcd8510bb26 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:34 -0700 Subject: perf annotate-data: Add variable_state_str() So that it can show a proper debug message in the right place. The check_variable() is used in other places which don't want to print the message. $ perf --debug type-profile annotate --data-type Before: ----------------------------------------------------------- find data type for 0x140(reg14) at update_blocked_averages+0x2db CU for kernel/sched/fair.c (die:0x12dd892) frame base: cfa=1 fbreg=7 no pointer or no type <<<--- removed check variable "__mptr" failed (die: 0x13022f1) variable location: base=reg14, offset=0x140 type='void*' size=0x8 (die:0x12dd8f9) After: ----------------------------------------------------------- find data type for 0x140(reg14) at update_blocked_averages+0x2db CU for kernel/sched/fair.c (die:0x12dd892) frame base: cfa=1 fbreg=7 found "__mptr" (die: 0x13022f1) in scope=4/4 (die: 0x13022e8) failed: no/void pointer <<<--- here variable location: base=reg14, offset=0x140 type='void*' size=0x8 (die:0x12dd8f9) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 8e3b422eca22..332254da49be 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -354,6 +354,25 @@ enum type_match_result { PERF_TMR_BAD_OFFSET, }; +static const char *match_result_str(enum type_match_result tmr) +{ + switch (tmr) { + case PERF_TMR_OK: + return "Good!"; + case PERF_TMR_NO_TYPE: + return "no type information"; + case PERF_TMR_NO_POINTER: + return "no/void pointer"; + case PERF_TMR_NO_SIZE: + return "type size is unknown"; + case PERF_TMR_BAD_OFFSET: + return "offset bigger than size"; + case PERF_TMR_UNKNOWN: + default: + return "invalid state"; + } +} + /* The type info will be saved in @type_die */ static enum type_match_result check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, @@ -373,7 +392,6 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, /* Get the type of the variable */ if (__die_get_real_type(var_die, type_die) == NULL) { - pr_debug_dtp("variable has no type\n"); ann_data_stat.no_typeinfo++; return PERF_TMR_NO_TYPE; } @@ -387,7 +405,6 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, if ((dwarf_tag(type_die) != DW_TAG_pointer_type && dwarf_tag(type_die) != 
DW_TAG_array_type) || __die_get_real_type(type_die, type_die) == NULL) { - pr_debug_dtp("no pointer or no type\n"); ann_data_stat.no_typeinfo++; return PERF_TMR_NO_POINTER; } @@ -400,15 +417,12 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, /* Get the size of the actual type */ if (dwarf_aggregate_size(&sized_type, &size) < 0) { - pr_debug_dtp("type size is unknown\n"); ann_data_stat.invalid_size++; return PERF_TMR_NO_SIZE; } /* Minimal sanity check */ if ((unsigned)offset >= size) { - pr_debug_dtp("offset: %d is bigger than size: %"PRIu64"\n", - offset, size); ann_data_stat.bad_offset++; return PERF_TMR_BAD_OFFSET; } @@ -1310,12 +1324,13 @@ retry: continue; } + pr_debug_dtp("found \"%s\" (die: %#lx) in scope=%d/%d (die: %#lx) ", + dwarf_diename(&var_die), (long)dwarf_dieoffset(&var_die), + i+1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); + /* Found a variable, see if it's correct */ result = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); if (result == PERF_TMR_OK) { - pr_debug_dtp("found \"%s\" in scope=%d/%d (die: %#lx) ", - dwarf_diename(&var_die), i+1, nr_scopes, - (long)dwarf_dieoffset(&scopes[i])); if (reg == DWARF_REG_PC) { pr_debug_dtp("addr=%#"PRIx64" type_offset=%#x\n", dloc->var_addr, offset); @@ -1325,17 +1340,13 @@ retry: } else { pr_debug_dtp("type_offset=%#x\n", offset); } - pr_debug_location(&var_die, pc, reg); - pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; } else { - pr_debug_dtp("check variable \"%s\" failed (die: %#lx)\n", - dwarf_diename(&var_die), - (long)dwarf_dieoffset(&var_die)); - pr_debug_location(&var_die, pc, reg); - pr_debug_type_name(type_die, TSR_KIND_TYPE); + pr_debug_dtp("failed: %s\n", match_result_str(result)); ret = -1; } + pr_debug_location(&var_die, pc, reg); + pr_debug_type_name(type_die, TSR_KIND_TYPE); dloc->type_offset = offset; goto out; } -- cgit v1.2.3 From 69e2c78425c92361f0ab01cbbb8f3a44fc94341f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:35 -0700 Subject: perf annotate-data: Change return type of find_data_type_block() So that it can return enum variable_match_type to be propagated to the find_data_type_die(). Also update the debug message to show the result of the check_matching_type(). chk [dd] reg0 offset=0 ok=1 kind=1 : Good! or chk [177] reg4 offset=0x138 ok=0 kind=0 cfa : no type information Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 117 ++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 332254da49be..5548cd8e84ba 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -352,6 +352,7 @@ enum type_match_result { PERF_TMR_NO_POINTER, PERF_TMR_NO_SIZE, PERF_TMR_BAD_OFFSET, + PERF_TMR_BAIL_OUT, }; static const char *match_result_str(enum type_match_result tmr) @@ -368,6 +369,7 @@ static const char *match_result_str(enum type_match_result tmr) case PERF_TMR_BAD_OFFSET: return "offset bigger than size"; case PERF_TMR_UNKNOWN: + case PERF_TMR_BAIL_OUT: default: return "invalid state"; } @@ -868,19 +870,19 @@ static void setup_stack_canary(struct data_loc_info *dloc) /* * It's at the target address, check if it has a matching type. 
- * It returns 1 if found, 0 if not or -1 if not found but no need to - * repeat the search. The last case is for per-cpu variables which + * It returns PERF_TMR_BAIL_OUT when it looks up per-cpu variables which * are similar to global variables and no additional info is needed. */ -static int check_matching_type(struct type_state *state, - struct data_loc_info *dloc, - Dwarf_Die *cu_die, Dwarf_Die *type_die) +static enum type_match_result check_matching_type(struct type_state *state, + struct data_loc_info *dloc, + Dwarf_Die *cu_die, + Dwarf_Die *type_die) { Dwarf_Word size; u32 insn_offset = dloc->ip - dloc->ms->sym->start; int reg = dloc->op->reg1; - pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d", + pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d ", insn_offset, reg, dloc->op->offset, state->regs[reg].ok, state->regs[reg].kind); @@ -896,15 +898,12 @@ static int check_matching_type(struct type_state *state, if (dloc->op->offset < 0 && reg != state->stack_reg) goto check_kernel; - pr_debug_dtp("\n"); - return -1; + return PERF_TMR_NO_POINTER; } - pr_debug_dtp("\n"); - /* Remove the pointer and get the target type */ if (__die_get_real_type(&state->regs[reg].type, type_die) == NULL) - return -1; + return PERF_TMR_NO_POINTER; dloc->type_offset = dloc->op->offset; @@ -916,33 +915,33 @@ static int check_matching_type(struct type_state *state, /* Get the size of the actual type */ if (dwarf_aggregate_size(&sized_type, &size) < 0 || (unsigned)dloc->type_offset >= size) - return -1; + return PERF_TMR_BAD_OFFSET; - return 1; + return PERF_TMR_OK; } if (reg == dloc->fbreg) { struct type_state_stack *stack; - pr_debug_dtp(" fbreg\n"); + pr_debug_dtp("fbreg"); stack = find_stack_state(state, dloc->type_offset); if (stack == NULL) - return 0; + return PERF_TMR_NO_TYPE; if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); - return -1; + return PERF_TMR_BAIL_OUT; } if (stack->kind != TSR_KIND_TYPE) - return 0; + return PERF_TMR_NO_TYPE; *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= stack->offset; - return 1; + return PERF_TMR_OK; } if (dloc->fb_cfa) { @@ -950,38 +949,38 @@ static int check_matching_type(struct type_state *state, u64 pc = map__rip_2objdump(dloc->ms->map, dloc->ip); int fbreg, fboff; - pr_debug_dtp(" cfa\n"); + pr_debug_dtp("cfa"); if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) fbreg = -1; if (reg != fbreg) - return 0; + return PERF_TMR_NO_TYPE; stack = find_stack_state(state, dloc->type_offset - fboff); if (stack == NULL) - return 0; + return PERF_TMR_NO_TYPE; if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); - return -1; + return PERF_TMR_BAIL_OUT; } if (stack->kind != TSR_KIND_TYPE) - return 0; + return PERF_TMR_NO_TYPE; *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= fboff + stack->offset; - return 1; + return PERF_TMR_OK; } if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { u64 var_addr = dloc->op->offset; int var_offset; - pr_debug_dtp(" percpu var\n"); + pr_debug_dtp("percpu var"); if (dloc->op->multi_regs) { int reg2 = dloc->op->reg2; @@ -997,14 +996,14 @@ static int check_matching_type(struct type_state *state, if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, &var_offset, type_die)) { dloc->type_offset = var_offset; - return 1; + return PERF_TMR_OK; } /* No need to retry per-cpu (global) variables */ - return -1; + return PERF_TMR_BAIL_OUT; } if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_POINTER) { - 
pr_debug_dtp(" percpu ptr\n"); + pr_debug_dtp("percpu ptr"); /* * It's actaully pointer but the address was calculated using @@ -1017,13 +1016,13 @@ static int check_matching_type(struct type_state *state, /* Get the size of the actual type */ if (dwarf_aggregate_size(type_die, &size) < 0 || (unsigned)dloc->type_offset >= size) - return -1; + return PERF_TMR_BAIL_OUT; - return 1; + return PERF_TMR_OK; } if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_CANARY) { - pr_debug_dtp(" stack canary\n"); + pr_debug_dtp("stack canary"); /* * This is a saved value of the stack canary which will be handled @@ -1032,7 +1031,7 @@ static int check_matching_type(struct type_state *state, */ setup_stack_canary(dloc); - return -1; + return PERF_TMR_BAIL_OUT; } check_kernel: @@ -1043,16 +1042,16 @@ check_kernel: /* Direct this-cpu access like "%gs:0x34740" */ if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm && arch__is(dloc->arch, "x86")) { - pr_debug_dtp(" this-cpu var\n"); + pr_debug_dtp("this-cpu var"); addr = dloc->op->offset; if (get_global_var_type(cu_die, dloc, dloc->ip, addr, &offset, type_die)) { dloc->type_offset = offset; - return 1; + return PERF_TMR_OK; } - return -1; + return PERF_TMR_BAIL_OUT; } /* Access to global variable like "-0x7dcf0500(,%rdx,8)" */ @@ -1061,31 +1060,30 @@ check_kernel: if (get_global_var_type(cu_die, dloc, dloc->ip, addr, &offset, type_die)) { - pr_debug_dtp(" global var\n"); + pr_debug_dtp("global var"); dloc->type_offset = offset; - return 1; + return PERF_TMR_OK; } - pr_debug_dtp(" negative offset\n"); - return -1; + return PERF_TMR_BAIL_OUT; } } - pr_debug_dtp("\n"); - return 0; + return PERF_TMR_UNKNOWN; } /* Iterate instructions in basic blocks and update type table */ -static int find_data_type_insn(struct data_loc_info *dloc, - struct list_head *basic_blocks, - struct die_var_type *var_types, - Dwarf_Die *cu_die, Dwarf_Die *type_die) +static enum type_match_result find_data_type_insn(struct data_loc_info *dloc, + struct list_head *basic_blocks, + struct die_var_type *var_types, + Dwarf_Die *cu_die, + Dwarf_Die *type_die) { struct type_state state; struct symbol *sym = dloc->ms->sym; struct annotation *notes = symbol__annotation(sym); struct annotated_basic_block *bb; - int ret = 0; + enum type_match_result ret = PERF_TMR_UNKNOWN; init_type_state(&state, dloc->arch); @@ -1111,6 +1109,7 @@ static int find_data_type_insn(struct data_loc_info *dloc, if (this_ip == dloc->ip) { ret = check_matching_type(&state, dloc, cu_die, type_die); + pr_debug_dtp(" : %s\n", match_result_str(ret)); goto out; } @@ -1137,24 +1136,25 @@ static int arch_supports_insn_tracking(struct data_loc_info *dloc) * Construct a list of basic blocks for each scope with variables and try to find * the data type by updating a type state table through instructions. 
*/ -static int find_data_type_block(struct data_loc_info *dloc, - Dwarf_Die *cu_die, Dwarf_Die *scopes, - int nr_scopes, Dwarf_Die *type_die) +static enum type_match_result find_data_type_block(struct data_loc_info *dloc, + Dwarf_Die *cu_die, + Dwarf_Die *scopes, + int nr_scopes, + Dwarf_Die *type_die) { LIST_HEAD(basic_blocks); struct die_var_type *var_types = NULL; u64 src_ip, dst_ip, prev_dst_ip; - int ret = -1; + enum type_match_result ret = PERF_TMR_UNKNOWN; /* TODO: other architecture support */ if (!arch_supports_insn_tracking(dloc)) - return -1; + return PERF_TMR_BAIL_OUT; prev_dst_ip = dst_ip = dloc->ip; for (int i = nr_scopes - 1; i >= 0; i--) { Dwarf_Addr base, start, end; LIST_HEAD(this_blocks); - int found; if (dwarf_ranges(&scopes[i], 0, &base, &start, &end) < 0) break; @@ -1185,9 +1185,9 @@ again: fixup_var_address(var_types, start); /* Find from start of this scope to the target instruction */ - found = find_data_type_insn(dloc, &basic_blocks, var_types, + ret = find_data_type_insn(dloc, &basic_blocks, var_types, cu_die, type_die); - if (found > 0) { + if (ret == PERF_TMR_OK) { char buf[64]; if (dloc->op->multi_regs) @@ -1199,11 +1199,10 @@ again: pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n", dloc->op->offset, buf, dloc->type_offset); pr_debug_type_name(type_die, TSR_KIND_TYPE); - ret = 0; break; } - if (found < 0) + if (ret == PERF_TMR_BAIL_OUT) break; /* Go up to the next scope and find blocks to the start */ @@ -1357,11 +1356,11 @@ retry: } if (reg != DWARF_REG_PC) { - ret = find_data_type_block(dloc, &cu_die, scopes, + result = find_data_type_block(dloc, &cu_die, scopes, nr_scopes, type_die); - if (ret == 0) { + if (result == PERF_TMR_OK) { ann_data_stat.insn_track++; - goto out; + ret = 0; } } -- cgit v1.2.3 From 98d1f1dc72fd6e0e85db4acc0b2dd591f4488216 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:36 -0700 Subject: perf annotate-data: Add is_pointer_type() helper It treats pointers and arrays in the same way. Let's add the helper and use it when it checks if it needs a pointer. 
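The reason arrays are treated alike: for a memory access both tags denote a base address plus an offset, so either can satisfy the "needs a pointer" check (illustrative snippet only):

    int a[4], *p = a;
    int x = a[1];  /* a has DW_TAG_array_type   */
    int y = p[1];  /* p has DW_TAG_pointer_type; both lower to base + sizeof(int) */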
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 5548cd8e84ba..b2414747cdeb 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -375,6 +375,13 @@ static const char *match_result_str(enum type_match_result tmr) } } +static bool is_pointer_type(Dwarf_Die *type_die) +{ + int tag = dwarf_tag(type_die); + + return tag == DW_TAG_pointer_type || tag == DW_TAG_array_type; +} + /* The type info will be saved in @type_die */ static enum type_match_result check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, @@ -382,15 +389,15 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, int offset, bool is_fbreg) { Dwarf_Word size; - bool is_pointer = true; + bool needs_pointer = true; Dwarf_Die sized_type; if (reg == DWARF_REG_PC) - is_pointer = false; + needs_pointer = false; else if (reg == dloc->fbreg || is_fbreg) - is_pointer = false; + needs_pointer = false; else if (arch__is(dloc->arch, "x86") && reg == X86_REG_SP) - is_pointer = false; + needs_pointer = false; /* Get the type of the variable */ if (__die_get_real_type(var_die, type_die) == NULL) { @@ -403,9 +410,8 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, * Convert to a real type it points to. But global variables * and local variables are accessed directly without a pointer. */ - if (is_pointer) { - if ((dwarf_tag(type_die) != DW_TAG_pointer_type && - dwarf_tag(type_die) != DW_TAG_array_type) || + if (needs_pointer) { + if (!is_pointer_type(type_die) || __die_get_real_type(type_die, type_die) == NULL) { ann_data_stat.no_typeinfo++; return PERF_TMR_NO_POINTER; @@ -887,14 +893,13 @@ static enum type_match_result check_matching_type(struct type_state *state, state->regs[reg].ok, state->regs[reg].kind); if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { - int tag = dwarf_tag(&state->regs[reg].type); Dwarf_Die sized_type; /* * Normal registers should hold a pointer (or array) to * dereference a memory location. */ - if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type) { + if (!is_pointer_type(&state->regs[reg].type)) { if (dloc->op->offset < 0 && reg != state->stack_reg) goto check_kernel; -- cgit v1.2.3 From c663451f9239c89746b55f1a17ad62b014fa7905 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:37 -0700 Subject: perf annotate-data: Add is_better_type() helper Sometimes more than one variables are located in the same register or a stack slot. Or it can overwrite existing information with others. I found this is not helpful in some cases so it needs to update the type information from the variable only if it's better. But it's hard to know which one is better, so we needs heuristics. :) As it deals with memory accesses, the location should have a pointer or something similar (like array or reference). So if it had an integer type and a variable is a pointer, we can take the variable's type to resolve the target of the access. If it has a pointer type and a variable with the same location has a different pointer type, it'll take one with bigger target type. 
This can be useful when the target type embeds a smaller type (like list header or RB-tree node) at the beginning so their location is same. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 61 ++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index b2414747cdeb..916d26bfb9eb 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -382,6 +382,38 @@ static bool is_pointer_type(Dwarf_Die *type_die) return tag == DW_TAG_pointer_type || tag == DW_TAG_array_type; } +/* returns if Type B has better information than Type A */ +static bool is_better_type(Dwarf_Die *type_a, Dwarf_Die *type_b) +{ + Dwarf_Word size_a, size_b; + Dwarf_Die die_a, die_b; + + /* pointer type is preferred */ + if (is_pointer_type(type_a) != is_pointer_type(type_b)) + return is_pointer_type(type_b); + + if (is_pointer_type(type_b)) { + /* + * We want to compare the target type, but 'void *' can fail to + * get the target type. + */ + if (die_get_real_type(type_a, &die_a) == NULL) + return true; + if (die_get_real_type(type_b, &die_b) == NULL) + return false; + + type_a = &die_a; + type_b = &die_b; + } + + /* bigger type is preferred */ + if (dwarf_aggregate_size(type_a, &size_a) < 0 || + dwarf_aggregate_size(type_b, &size_b) < 0) + return false; + + return size_a < size_b; +} + /* The type info will be saved in @type_die */ static enum type_match_result check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, @@ -741,24 +773,33 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo if (!dwarf_offdie(dloc->di->dbg, var->die_off, &mem_die)) continue; - if (var->reg == DWARF_REG_FB) { - findnew_stack_state(state, var->offset, TSR_KIND_TYPE, - &mem_die); + if (var->reg == DWARF_REG_FB || var->reg == fbreg) { + int offset = var->offset; + struct type_state_stack *stack; - pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", - insn_offset, -var->offset); - pr_debug_type_name(&mem_die, TSR_KIND_TYPE); - } else if (var->reg == fbreg) { - findnew_stack_state(state, var->offset - fb_offset, - TSR_KIND_TYPE, &mem_die); + if (var->reg != DWARF_REG_FB) + offset -= fb_offset; + + stack = find_stack_state(state, offset); + if (stack && stack->kind == TSR_KIND_TYPE && + !is_better_type(&stack->type, &mem_die)) + continue; + + findnew_stack_state(state, offset, TSR_KIND_TYPE, + &mem_die); pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", - insn_offset, -var->offset + fb_offset); + insn_offset, -offset); pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } else if (has_reg_type(state, var->reg) && var->offset == 0) { struct type_state_reg *reg; reg = &state->regs[var->reg]; + + if (reg->ok && reg->kind == TSR_KIND_TYPE && + !is_better_type(®->type, &mem_die)) + continue; + reg->type = mem_die; reg->kind = TSR_KIND_TYPE; reg->ok = true; -- cgit v1.2.3 From ba8833703b49451453d97bf6414a522293f7e31d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:38 -0700 Subject: perf annotate-data: Check variables in every scope Sometimes it matches a variable in the inner scope but it fails because the actual access can be on a different type. 
Let's try variables in every scope and choose the best one using is_better_type(). I have an example with update_blocked_averages(), at first it found a variable (__mptr) but it's a void pointer. So it moved on to the upper scope and found another variable (cfs_rq). $ perf --debug type-profile annotate --data-type --stdio ... ----------------------------------------------------------- find data type for 0x140(reg14) at update_blocked_averages+0x2db CU for kernel/sched/fair.c (die:0x12dd892) frame base: cfa=1 fbreg=7 found "__mptr" (die: 0x13022f1) in scope=4/4 (die: 0x13022e8) failed: no/void pointer variable location: base=reg14, offset=0x140 type='void*' size=0x8 (die:0x12dd8f9) found "cfs_rq" (die: 0x1301721) in scope=3/4 (die: 0x130171c) type_offset=0x140 variable location: reg14 type='struct cfs_rq' size=0x1c0 (die:0x12e37e5) final type: type='struct cfs_rq' size=0x1c0 (die:0x12e37e5) IIUC the scope is like below: 1: update_blocked_averages 2: __update_blocked_fair 3: for_each_leaf_cfs_rq_safe 4: list_entry -> (container_of) The container_of is implemented like: #define container_of(ptr, type, member) ({ \ void *__mptr = (void *)(ptr); \ static_assert(__same_type(*(ptr), ((type *)0)->member) || \ __same_type(*(ptr), void), \ "pointer type mismatch in container_of()"); \ ((type *)(__mptr - offsetof(type, member))); }) That's why we see the __mptr variable first but it failed since it has no type information. Then for_each_leaf_cfs_rq_safe() is defined as #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) Note that the access was 0x140(r14). And the cfs_rq has leaf_cfs_rq_list at the 0x140. So it converts the list_head pointer to a pointer to struct cfs_rq here. $ pahole --hex -C cfs_rq vmlinux | grep 140 struct cfs_rq struct list_head leaf_cfs_rq_list; /* 0x140 0x10 */ Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 44 +++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 916d26bfb9eb..e86f40fed323 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1244,7 +1244,6 @@ again: pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n", dloc->op->offset, buf, dloc->type_offset); - pr_debug_type_name(type_die, TSR_KIND_TYPE); break; } @@ -1273,6 +1272,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) int fbreg = -1; int fb_offset = 0; bool is_fbreg = false; + bool found = false; u64 pc; char buf[64]; enum type_match_result result; @@ -1358,14 +1358,17 @@ retry: /* Search from the inner-most scope to the outer */ for (i = nr_scopes - 1; i >= 0; i--) { + Dwarf_Die mem_die; + int type_offset = offset; + if (reg == DWARF_REG_PC) { if (!die_find_variable_by_addr(&scopes[i], dloc->var_addr, - &var_die, &offset)) + &var_die, &type_offset)) continue; } else { /* Look up variables/parameters in this scope */ if (!die_find_variable_by_reg(&scopes[i], pc, reg, - &offset, is_fbreg, &var_die)) + &type_offset, is_fbreg, &var_die)) continue; } @@ -1374,43 +1377,50 @@ retry: i+1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); /* Found a variable, see if it's correct */ 
- result = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); + result = check_variable(dloc, &var_die, &mem_die, reg, type_offset, is_fbreg); if (result == PERF_TMR_OK) { if (reg == DWARF_REG_PC) { pr_debug_dtp("addr=%#"PRIx64" type_offset=%#x\n", - dloc->var_addr, offset); + dloc->var_addr, type_offset); } else if (reg == DWARF_REG_FB || is_fbreg) { pr_debug_dtp("stack_offset=%#x type_offset=%#x\n", - fb_offset, offset); + fb_offset, type_offset); } else { - pr_debug_dtp("type_offset=%#x\n", offset); + pr_debug_dtp("type_offset=%#x\n", type_offset); + } + + if (!found || is_better_type(type_die, &mem_die)) { + *type_die = mem_die; + dloc->type_offset = type_offset; + found = true; } - ret = 0; } else { pr_debug_dtp("failed: %s\n", match_result_str(result)); - ret = -1; } + pr_debug_location(&var_die, pc, reg); - pr_debug_type_name(type_die, TSR_KIND_TYPE); - dloc->type_offset = offset; - goto out; + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } - if (loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) { + if (!found && loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) { reg = loc->reg2; goto retry; } - if (reg != DWARF_REG_PC) { + if (!found && reg != DWARF_REG_PC) { result = find_data_type_block(dloc, &cu_die, scopes, - nr_scopes, type_die); + nr_scopes, type_die); if (result == PERF_TMR_OK) { ann_data_stat.insn_track++; - ret = 0; + found = true; } } - if (ret < 0) { + if (found) { + pr_debug_dtp("final type:"); + pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = 0; + } else { pr_debug_dtp("no variable found\n"); ann_data_stat.no_var++; } -- cgit v1.2.3 From 023aceecc74ae7dd9e91c8e35d641da74a371078 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:39 -0700 Subject: perf annotate-data: Update type stat at the end of find_data_type_die() After trying all possibilities with DWARF and instruction tracking. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-10-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 47 ++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index e86f40fed323..aa330c7d8edd 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -432,10 +432,8 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, needs_pointer = false; /* Get the type of the variable */ - if (__die_get_real_type(var_die, type_die) == NULL) { - ann_data_stat.no_typeinfo++; + if (__die_get_real_type(var_die, type_die) == NULL) return PERF_TMR_NO_TYPE; - } /* * Usually it expects a pointer type for a memory access. 
@@ -444,10 +442,8 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, */ if (needs_pointer) { if (!is_pointer_type(type_die) || - __die_get_real_type(type_die, type_die) == NULL) { - ann_data_stat.no_typeinfo++; + __die_get_real_type(type_die, type_die) == NULL) return PERF_TMR_NO_POINTER; - } } if (dwarf_tag(type_die) == DW_TAG_typedef) @@ -456,16 +452,12 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, sized_type = *type_die; /* Get the size of the actual type */ - if (dwarf_aggregate_size(&sized_type, &size) < 0) { - ann_data_stat.invalid_size++; + if (dwarf_aggregate_size(&sized_type, &size) < 0) return PERF_TMR_NO_SIZE; - } /* Minimal sanity check */ - if ((unsigned)offset >= size) { - ann_data_stat.bad_offset++; + if ((unsigned)offset >= size) return PERF_TMR_BAD_OFFSET; - } return PERF_TMR_OK; } @@ -1275,7 +1267,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) bool found = false; u64 pc; char buf[64]; - enum type_match_result result; + enum type_match_result result = PERF_TMR_UNKNOWN; if (dloc->op->multi_regs) snprintf(buf, sizeof(buf), "reg%d, reg%d", dloc->op->reg1, dloc->op->reg2); @@ -1317,7 +1309,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) pr_debug_dtp("found by addr=%#"PRIx64" type_offset=%#x\n", dloc->var_addr, offset); pr_debug_type_name(type_die, TSR_KIND_TYPE); - ret = 0; + found = true; goto out; } } @@ -1416,16 +1408,37 @@ retry: } } +out: if (found) { pr_debug_dtp("final type:"); pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; } else { - pr_debug_dtp("no variable found\n"); - ann_data_stat.no_var++; + switch (result) { + case PERF_TMR_NO_TYPE: + case PERF_TMR_NO_POINTER: + pr_debug_dtp("%s\n", match_result_str(result)); + ann_data_stat.no_typeinfo++; + break; + case PERF_TMR_NO_SIZE: + pr_debug_dtp("%s\n", match_result_str(result)); + ann_data_stat.invalid_size++; + break; + case PERF_TMR_BAD_OFFSET: + pr_debug_dtp("%s\n", match_result_str(result)); + ann_data_stat.bad_offset++; + break; + case PERF_TMR_UNKNOWN: + case PERF_TMR_BAIL_OUT: + case PERF_TMR_OK: /* should not reach here */ + default: + pr_debug_dtp("no variable found\n"); + ann_data_stat.no_var++; + break; + } + ret = -1; } -out: free(scopes); return ret; } -- cgit v1.2.3 From 4e97d521c2be094718c4c5c7c4f785e8972b4af0 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 2 Jul 2024 13:15:36 +0200 Subject: selftests: netfilter: nft_queue.sh: sctp coverage Test that nfqueue with and without GSO process SCTP packets correctly. Joint work with Florian and Pablo. 
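The nft ruleset bodies do not survive in the heredocs of the diff below; the rules the forward test loads look roughly like this hypothetical reconstruction (hook, port and queue number 10 inferred from the surrounding script, everything else assumed):

    ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
    table inet sctpq {
            chain forward {
                    type filter hook forward priority 0; policy accept;
                    sctp dport 12345 queue num 10
            }
    }
    EOF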
Signed-off-by: Antonio Ojea Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/netfilter/config | 2 + tools/testing/selftests/net/netfilter/nft_queue.sh | 85 +++++++++++++++++++++- 2 files changed, 86 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 63ef80ef47a4..b2dd4db45215 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -87,3 +87,5 @@ CONFIG_XFRM_USER=m CONFIG_XFRM_STATISTICS=y CONFIG_NET_PKTGEN=m CONFIG_TUN=m +CONFIG_INET_DIAG=m +CONFIG_SCTP_DIAG=m diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index c61d23a8c88d..f3bdeb1271eb 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -25,6 +25,9 @@ cleanup() } checktool "nft --version" "test without nft tool" +checktool "socat -h" "run test without socat" + +modprobe -q sctp trap cleanup EXIT @@ -265,7 +268,6 @@ test_tcp_forward() test_tcp_localhost() { - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! @@ -375,6 +377,82 @@ EOF wait 2>/dev/null } +sctp_listener_ready() +{ + ss -S -N "$1" -ln -o "sport = :12345" | grep -q 12345 +} + +test_sctp_forward() +{ + ip netns exec "$nsrouter" nft -f /dev/stdin < "$TMPFILE1" & + local rpid=$! + + busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2" + + ip netns exec "$nsrouter" ./nf_queue -q 10 -G -t "$timeout" & + local nfqpid=$! + + ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + if ! ip netns exec "$nsrouter" nft delete table inet sctpq; then + echo "FAIL: Could not delete sctpq table" + exit 1 + fi + + wait "$rpid" && echo "PASS: sctp and nfqueue in forward chain" + + if ! diff -u "$TMPINPUT" "$TMPFILE1" ; then + echo "FAIL: lost packets?!" 1>&2 + exit 1 + fi +} + +test_sctp_output() +{ + ip netns exec "$ns1" nft -f /dev/stdin < "$TMPFILE1" & + local rpid=$! + + busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2" + + ip netns exec "$ns1" ./nf_queue -q 11 -t "$timeout" & + local nfqpid=$! + + ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + if ! ip netns exec "$ns1" nft delete table inet sctpq; then + echo "FAIL: Could not delete sctpq table" + exit 1 + fi + + # must wait before checking completeness of output file. + wait "$rpid" && echo "PASS: sctp and nfqueue in output chain with GSO" + + if ! diff -u "$TMPINPUT" "$TMPFILE1" ; then + echo "FAIL: lost packets?!" 1>&2 + exit 1 + fi +} + test_queue_removal() { read tainted_then < /proc/sys/kernel/tainted @@ -443,11 +521,16 @@ test_queue 10 # same. We queue to a second program as well. 
load_ruleset "filter2" 20 test_queue 20 +ip netns exec "$ns1" nft flush ruleset test_tcp_forward test_tcp_localhost test_tcp_localhost_connectclose test_tcp_localhost_requeue +test_sctp_forward +test_sctp_output + +# should be last, adds vrf device in ns1 and changes routes test_icmp_vrf test_queue_removal -- cgit v1.2.3 From 2518e13275ab9ea6b2540f828cf78b0280991f85 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 19 Aug 2024 10:34:03 +0800 Subject: perf python: Fix the build on 32-bit arm by including missing "util/sample.h" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 32-bit arm build system will complain: tools/perf/util/python.c:75:28: error: field ‘sample’ has incomplete type 75 | struct perf_sample sample; However, arm64 build system doesn't complain this. The root cause is arm64 define "HAVE_KVM_STAT_SUPPORT := 1" in tools/perf/arch/arm64/Makefile, but arm arch doesn't define this. This will lead to kvm-stat.h include other header files on arm64 build system, especially "util/sample.h" for util/python.c. This will try to directly include "util/sample.h" for "util/python.c" to avoid such build issue on arm platform. Signed-off-by: Xu Yang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: imx@lists.linux.dev Link: https://lore.kernel.org/r/20240819023403.201324-1-xu.yang_2@nxp.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 3be882b2e845..31a223eaf8e6 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -20,6 +20,7 @@ #include "util/env.h" #include "util/kvm-stat.h" #include "util/kwork.h" +#include "util/sample.h" #include "util/lock-contention.h" #include #include "../builtin.h" -- cgit v1.2.3 From 2aebebb834e208344eeabcc4cacfc680026c8f6f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:30 -0700 Subject: perf synthetic-events: Avoid unnecessary memset Make sure the memset of a synthesized event only zeros the necessary tracing data part of the event, as a full event can be over 4kb in size. 
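In miniature (assuming the local is a union perf_event, as is usual in perf; only the member that is actually filled in gets cleared):

    union perf_event ev;  /* the full union can exceed 4kb */

    memset(&ev, 0, sizeof(ev));               /* before: zeroes every byte */
    memset(&ev, 0, sizeof(ev.tracing_data));  /* after: only tracing_data  */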
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 7f884d70de81..0a7f93ae76fb 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -2200,7 +2200,7 @@ int perf_event__synthesize_tracing_data(const struct perf_tool *tool, int fd, st if (!tdata) return -1; - memset(&ev, 0, sizeof(ev)); + memset(&ev, 0, sizeof(ev.tracing_data)); ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA; size = tdata->size; -- cgit v1.2.3 From a031073626d135b332c1004dd6ade1b1e8d7c9f8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:31 -0700 Subject: perf map: API clean up map__init() is only used internally so make it static. Assume memory is zero initialized, which will better support adding fields to struct map in the future and was already the case for map__new2. To reduce complexity, change set_priv and set_erange_warned to not take a value to assign as they always assign true. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 2 +- tools/perf/tests/vmlinux-kallsyms.c | 4 ++-- tools/perf/util/map.c | 24 ++++++++++++------------ tools/perf/util/map.h | 11 ++++------- 4 files changed, 19 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 881b861c35ee..724a79386321 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -191,7 +191,7 @@ static void ui__warn_map_erange(struct map *map, struct symbol *sym, u64 ip) if (use_browser <= 0) sleep(5); - map__set_erange_warned(map, true); + map__set_erange_warned(map); } static void perf_top__record_precise_ip(struct perf_top *top, diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index cd3b480d20bd..74cdbd2ce9d0 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -131,7 +131,7 @@ static int test__vmlinux_matches_kallsyms_cb1(struct map *map, void *data) (dso__kernel(dso) ? 
dso__short_name(dso) : dso__name(dso))); if (pair) { - map__set_priv(pair, 1); + map__set_priv(pair); map__put(pair); } else { if (!args->header_printed) { @@ -166,7 +166,7 @@ static int test__vmlinux_matches_kallsyms_cb2(struct map *map, void *data) pr_info(":\nWARN: *%" PRIx64 "-%" PRIx64 " %" PRIx64, map__start(pair), map__end(pair), map__pgoff(pair)); pr_info(" %s\n", dso__name(dso)); - map__set_priv(pair, 1); + map__set_priv(pair); } map__put(pair); return 0; diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index e1d14936a60d..e781c8d56a9a 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -102,16 +102,20 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) return false; } -void map__init(struct map *map, u64 start, u64 end, u64 pgoff, struct dso *dso) +static void map__init(struct map *map, u64 start, u64 end, u64 pgoff, + struct dso *dso, u32 prot, u32 flags) { map__set_start(map, start); map__set_end(map, end); map__set_pgoff(map, pgoff); - map__set_reloc(map, 0); + assert(map__reloc(map) == 0); map__set_dso(map, dso__get(dso)); - map__set_mapping_type(map, MAPPING_TYPE__DSO); - map__set_erange_warned(map, false); refcount_set(map__refcnt(map), 1); + RC_CHK_ACCESS(map)->prot = prot; + RC_CHK_ACCESS(map)->flags = flags; + map__set_mapping_type(map, MAPPING_TYPE__DSO); + assert(map__erange_warned(map) == false); + assert(map__priv(map) == false); } struct map *map__new(struct machine *machine, u64 start, u64 len, @@ -124,7 +128,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, struct nsinfo *nsi = NULL; struct nsinfo *nnsi; - map = malloc(sizeof(*map)); + map = zalloc(sizeof(*map)); if (ADD_RC_CHK(result, map)) { char newfilename[PATH_MAX]; struct dso *dso, *header_bid_dso; @@ -134,8 +138,6 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, anon = is_anon_memory(filename) || flags & MAP_HUGETLB; vdso = is_vdso_map(filename); no_dso = is_no_dso_memory(filename); - map->prot = prot; - map->flags = flags; nsi = nsinfo__get(thread__nsinfo(thread)); if ((anon || no_dso) && nsi && (prot & PROT_EXEC)) { @@ -169,7 +171,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, goto out_delete; assert(!dso__kernel(dso)); - map__init(result, start, start + len, pgoff, dso); + map__init(result, start, start + len, pgoff, dso, prot, flags); if (anon || no_dso) { map->mapping_type = MAPPING_TYPE__IDENTITY; @@ -223,10 +225,8 @@ struct map *map__new2(u64 start, struct dso *dso) map = calloc(1, sizeof(*map) + (dso__kernel(dso) ? sizeof(struct kmap) : 0)); if (ADD_RC_CHK(result, map)) { - /* - * ->end will be filled after we load all the symbols - */ - map__init(result, start, 0, 0, dso); + /* ->end will be filled after we load all the symbols. 
*/ + map__init(result, start, /*end=*/0, /*pgoff=*/0, dso, /*prot=*/0, /*flags=*/0); } return result; diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 65e2609fa1b1..6c43f31a9fe0 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -166,9 +166,6 @@ struct thread; #define map__for_each_symbol_by_name(map, sym_name, pos, idx) \ __map__for_each_symbol_by_name(map, sym_name, (pos), idx) -void map__init(struct map *map, - u64 start, u64 end, u64 pgoff, struct dso *dso); - struct dso_id; struct build_id; @@ -285,14 +282,14 @@ static inline void map__set_reloc(struct map *map, u64 reloc) RC_CHK_ACCESS(map)->reloc = reloc; } -static inline void map__set_priv(struct map *map, int priv) +static inline void map__set_priv(struct map *map) { - RC_CHK_ACCESS(map)->priv = priv; + RC_CHK_ACCESS(map)->priv = true; } -static inline void map__set_erange_warned(struct map *map, bool erange_warned) +static inline void map__set_erange_warned(struct map *map) { - RC_CHK_ACCESS(map)->erange_warned = erange_warned; + RC_CHK_ACCESS(map)->erange_warned = true; } static inline void map__set_dso(struct map *map, struct dso *dso) -- cgit v1.2.3 From 0847c193c3d777b79af9d019d6509b5857591f33 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:32 -0700 Subject: perf jit: Constify filename argument Make it clearer the argument is just being used as a string. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/jit.h | 3 ++- tools/perf/util/jitdump.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/jit.h b/tools/perf/util/jit.h index fb810e1b2de7..f4037203e9ec 100644 --- a/tools/perf/util/jit.h +++ b/tools/perf/util/jit.h @@ -5,7 +5,8 @@ #include int jit_process(struct perf_session *session, struct perf_data *output, - struct machine *machine, char *filename, pid_t pid, pid_t tid, u64 *nbytes); + struct machine *machine, const char *filename, + pid_t pid, pid_t tid, u64 *nbytes); int jit_inject_record(const char *filename); diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 5ce13653512b..346513e5e9b7 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -710,7 +710,7 @@ jit_process_dump(struct jit_buf_desc *jd) } static int -jit_inject(struct jit_buf_desc *jd, char *path) +jit_inject(struct jit_buf_desc *jd, const char *path) { int ret; @@ -737,7 +737,7 @@ jit_inject(struct jit_buf_desc *jd, char *path) * as captured in the RECORD_MMAP record */ static int -jit_detect(char *mmap_name, pid_t pid, struct nsinfo *nsi) +jit_detect(const char *mmap_name, pid_t pid, struct nsinfo *nsi) { char *p; char *end = NULL; @@ -821,7 +821,7 @@ int jit_process(struct perf_session *session, struct perf_data *output, struct machine *machine, - char *filename, + const char *filename, pid_t pid, pid_t tid, u64 *nbytes) -- cgit v1.2.3 From e4bb4caa54b24b839c21612d816b40673a75f6d4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:33 -0700 Subject: perf dso: Constify dso_id The 
passed dso_id is copied and so is never an out argument. Remove its mutability. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 2 +- tools/perf/util/dso.c | 4 ++-- tools/perf/util/dso.h | 4 ++-- tools/perf/util/dsos.c | 12 ++++++------ tools/perf/util/dsos.h | 2 +- tools/perf/util/machine.c | 3 ++- tools/perf/util/machine.h | 3 ++- 7 files changed, 16 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a35bde3f3c09..8440ddfbf4fe 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -411,7 +411,7 @@ static int perf_event__jit_repipe_mmap(const struct perf_tool *tool, #endif static struct dso *findnew_dso(int pid, int tid, const char *filename, - struct dso_id *id, struct machine *machine) + const struct dso_id *id, struct machine *machine) { struct thread *thread; struct nsinfo *nsi = NULL; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 67414944f245..5c6e85fdae0d 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1327,7 +1327,7 @@ bool dso_id__empty(const struct dso_id *id) return !id->maj && !id->min && !id->ino && !id->ino_generation; } -void __dso__inject_id(struct dso *dso, struct dso_id *id) +void __dso__inject_id(struct dso *dso, const struct dso_id *id) { struct dsos *dsos = dso__dsos(dso); struct dso_id *dso_id = dso__id(dso); @@ -1417,7 +1417,7 @@ void dso__set_sorted_by_name(struct dso *dso) RC_CHK_ACCESS(dso)->sorted_by_name = true; } -struct dso *dso__new_id(const char *name, struct dso_id *id) +struct dso *dso__new_id(const char *name, const struct dso_id *id) { RC_STRUCT(dso) *dso = zalloc(sizeof(*dso) + strlen(name) + 1); struct dso *res; diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index ed0068251c65..bb8e8f444054 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -640,14 +640,14 @@ static inline void dso__set_text_offset(struct dso *dso, u64 val) int dso_id__cmp(const struct dso_id *a, const struct dso_id *b); bool dso_id__empty(const struct dso_id *id); -struct dso *dso__new_id(const char *name, struct dso_id *id); +struct dso *dso__new_id(const char *name, const struct dso_id *id); struct dso *dso__new(const char *name); void dso__delete(struct dso *dso); int dso__cmp_id(struct dso *a, struct dso *b); void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated); void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated); -void __dso__inject_id(struct dso *dso, struct dso_id *id); +void __dso__inject_id(struct dso *dso, const struct dso_id *id); int dso__name_len(const struct dso *dso); diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index d4acdb37f046..e0998e2a7c4e 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -155,7 +155,7 @@ static int dsos__cmp_key_long_name_id(const void *vkey, const void *vdso) */ static struct dso *__dsos__find_by_longname_id(struct dsos *dsos, const char *name, - struct dso_id *id, + const struct 
dso_id *id, bool write_locked) { struct dsos__key key = { @@ -244,7 +244,7 @@ int dsos__add(struct dsos *dsos, struct dso *dso) struct dsos__find_id_cb_args { const char *name; - struct dso_id *id; + const struct dso_id *id; struct dso *res; }; @@ -260,7 +260,7 @@ static int dsos__find_id_cb(struct dso *dso, void *data) } -static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct dso_id *id, +static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, const struct dso_id *id, bool cmp_short, bool write_locked) { struct dso *res; @@ -321,7 +321,7 @@ static void dso__set_basename(struct dso *dso) dso__set_short_name(dso, base, true); } -static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct dso_id *id) +static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, const struct dso_id *id) { struct dso *dso = dso__new_id(name, id); @@ -337,7 +337,7 @@ static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct return dso; } -static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) +static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, const struct dso_id *id) { struct dso *dso = __dsos__find_id(dsos, name, id, false, /*write_locked=*/true); @@ -347,7 +347,7 @@ static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struc return dso ? dso : __dsos__addnew_id(dsos, name, id); } -struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) +struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, const struct dso_id *id) { struct dso *dso; down_write(&dsos->lock); diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index 6c13b65648bc..a26774950866 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -32,7 +32,7 @@ int __dsos__add(struct dsos *dsos, struct dso *dso); int dsos__add(struct dsos *dsos, struct dso *dso); struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short); -struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id); +struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, const struct dso_id *id); bool dsos__read_build_ids(struct dsos *dsos, bool with_hits); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index c08774d06d14..cd79a830abae 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -3128,7 +3128,8 @@ out: return addr_cpumode; } -struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, struct dso_id *id) +struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, + const struct dso_id *id) { return dsos__findnew_id(&machine->dsos, filename, id); } diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 82a47bac8023..a687876e3453 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -207,7 +207,8 @@ int machine__nr_cpus_avail(struct machine *machine); struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid); -struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, struct dso_id *id); +struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, + const struct dso_id *id); struct dso *machine__findnew_dso(struct machine *machine, const char *filename); size_t machine__fprintf(struct machine *machine, FILE *fp); -- cgit v1.2.3 From 63c89dc5e129a73cd3f8528640d2496543b79706 Mon Sep 17 00:00:00 2001 
From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:34 -0700 Subject: perf evsel: Constify evsel__id_hdr_size() argument Allows evsel__id_hdr_size() to be used when the evsel is const. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 2 +- tools/perf/util/evsel.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 38a74d6dde49..49cc71511c0c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -3048,7 +3048,7 @@ int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event, return 0; } -u16 evsel__id_hdr_size(struct evsel *evsel) +u16 evsel__id_hdr_size(const struct evsel *evsel) { u64 sample_type = evsel->core.attr.sample_type; u16 size = 0; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4316992f6a69..15acf293e12a 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -442,7 +442,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event, u64 *timestamp); -u16 evsel__id_hdr_size(struct evsel *evsel); +u16 evsel__id_hdr_size(const struct evsel *evsel); static inline struct evsel *evsel__next(struct evsel *evsel) { -- cgit v1.2.3 From a8656614ebe017a0487ba9f85213910c09e89c59 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:35 -0700 Subject: perf test: Expand pipe/inject test Test recording of call-graphs and injecting --buildid-all. Add/expand the trap handler. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/pipe_test.sh | 101 ++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/pipe_test.sh b/tools/perf/tests/shell/pipe_test.sh index a78d35d2cff0..ad10012fdc29 100755 --- a/tools/perf/tests/shell/pipe_test.sh +++ b/tools/perf/tests/shell/pipe_test.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # perf pipe recording and injection test # SPDX-License-Identifier: GPL-2.0 @@ -12,30 +12,87 @@ skip_test_missing_symbol ${sym} data=$(mktemp /tmp/perf.data.XXXXXX) prog="perf test -w noploop" -task="perf" +err=0 -if ! perf record -e task-clock:u -o - ${prog} | perf report -i - --task | grep ${task}; then - echo "cannot find the test file in the perf report" - exit 1 -fi +set -e -if !
perf record -e task-clock:u -o - ${prog} | perf inject -b | perf report -i - | grep ${sym}; then - echo "cannot find noploop function in pipe #1" - exit 1 -fi +cleanup() { + rm -rf "${data}" + rm -rf "${data}".old -perf record -e task-clock:u -o - ${prog} | perf inject -b -o ${data} -if ! perf report -i ${data} | grep ${sym}; then - echo "cannot find noploop function in pipe #2" - exit 1 -fi + trap - EXIT TERM INT +} -perf record -e task-clock:u -o ${data} ${prog} -if ! perf inject -b -i ${data} | perf report -i - | grep ${sym}; then - echo "cannot find noploop function in pipe #3" - exit 1 -fi +trap_cleanup() { + echo "Unexpected signal in ${FUNCNAME[1]}" + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT +test_record_report() { + echo + echo "Record+report pipe test" + + task="perf" + if ! perf record -e task-clock:u -o - ${prog} | perf report -i - --task | grep -q ${task} + then + echo "Record+report pipe test [Failed - cannot find the test file in the perf report #1]" + err=1 + return + fi + + if ! perf record -g -e task-clock:u -o - ${prog} | perf report -i - --task | grep -q ${task} + then + echo "Record+report pipe test [Failed - cannot find the test file in the perf report #2]" + err=1 + return + fi + + echo "Record+report pipe test [Success]" +} + +test_inject_bids() { + inject_opt=$1 + + echo + echo "Inject ${inject_opt} build-ids test" + + if ! perf record -e task-clock:u -o - ${prog} | perf inject ${inject_opt}| perf report -i - | grep -q ${sym} + then + echo "Inject build-ids test [Failed - cannot find noploop function in pipe #1]" + err=1 + return + fi + + if ! perf record -g -e task-clock:u -o - ${prog} | perf inject ${inject_opt} | perf report -i - | grep -q ${sym} + then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #2]" + err=1 + return + fi + + perf record -e task-clock:u -o - ${prog} | perf inject ${inject_opt} -o ${data} + if ! perf report -i ${data} | grep -q ${sym}; then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #3]" + err=1 + return + fi + + perf record -e task-clock:u -o ${data} ${prog} + if ! perf inject ${inject_opt} -i ${data} | perf report -i - | grep -q ${sym}; then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #4]" + err=1 + return + fi + + echo "Inject ${inject_opt} build-ids test [Success]" +} + +test_record_report +test_inject_bids -b +test_inject_bids --buildid-all + +cleanup +exit $err -rm -f ${data} ${data}.old -exit 0 -- cgit v1.2.3 From 0ed4c8c31139e3360c5db9f536e8ff37c1824f8a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:36 -0700 Subject: perf inject: Combine build_ids and build_id_all into enum It is clearer to have a single enum that determines how build ids are injected, it also allows for future extension. Set the header build ID feature whether lazy or all are generated, previously only the lazy case would set it. Allow parsing of known build IDs for either the lazy or all cases. 
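Sketched in isolation (the enum values match the diff below; the helper is illustrative only), the two booleans collapse into one style value, with --buildid-all winning when both flags are given:

#include <stdbool.h>

enum build_id_rewrite_style {
	BID_RWS__NONE = 0,
	BID_RWS__INJECT_HEADER_LAZY,	/* -b: only DSOs hit by samples */
	BID_RWS__INJECT_HEADER_ALL,	/* --buildid-all: every DSO */
};

/* Illustrative: later assignments win, mirroring cmd_inject(). */
static enum build_id_rewrite_style
pick_style(bool build_ids, bool build_id_all)
{
	enum build_id_rewrite_style style = BID_RWS__NONE;

	if (build_ids)
		style = BID_RWS__INJECT_HEADER_LAZY;
	if (build_id_all)
		style = BID_RWS__INJECT_HEADER_ALL;
	return style;
}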
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 51 +++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8440ddfbf4fe..865d16ceead2 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -103,11 +103,16 @@ struct guest_session { struct guest_event ev; }; +enum build_id_rewrite_style { + BID_RWS__NONE = 0, + BID_RWS__INJECT_HEADER_LAZY, + BID_RWS__INJECT_HEADER_ALL, +}; + struct perf_inject { struct perf_tool tool; struct perf_session *session; - bool build_ids; - bool build_id_all; + enum build_id_rewrite_style build_id_style; bool sched_stat; bool have_auxtrace; bool strip; @@ -2015,8 +2020,8 @@ static int __cmd_inject(struct perf_inject *inject) signal(SIGINT, sig_handler); - if (inject->build_ids || inject->sched_stat || - inject->itrace_synth_opts.set || inject->build_id_all) { + if (inject->build_id_style != BID_RWS__NONE || inject->sched_stat || + inject->itrace_synth_opts.set) { inject->tool.mmap = perf_event__repipe_mmap; inject->tool.mmap2 = perf_event__repipe_mmap2; inject->tool.fork = perf_event__repipe_fork; @@ -2027,10 +2032,10 @@ static int __cmd_inject(struct perf_inject *inject) output_data_offset = perf_session__data_offset(session->evlist); - if (inject->build_id_all) { + if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { inject->tool.mmap = perf_event__repipe_buildid_mmap; inject->tool.mmap2 = perf_event__repipe_buildid_mmap2; - } else if (inject->build_ids) { + } else if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY) { inject->tool.sample = perf_event__inject_buildid; } else if (inject->sched_stat) { struct evsel *evsel; @@ -2148,9 +2153,9 @@ static int __cmd_inject(struct perf_inject *inject) .inject = inject, }; - if (inject->build_ids) - perf_header__set_feat(&session->header, - HEADER_BUILD_ID); + if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY || + inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) + perf_header__set_feat(&session->header, HEADER_BUILD_ID); /* * Keep all buildids when there is unprocessed AUX data because * it is not known which ones the AUX trace hits. 
@@ -2211,11 +2216,13 @@ int cmd_inject(int argc, const char **argv) int ret; bool repipe = true; const char *known_build_ids = NULL; + bool build_ids; + bool build_id_all; struct option options[] = { - OPT_BOOLEAN('b', "build-ids", &inject.build_ids, + OPT_BOOLEAN('b', "build-ids", &build_ids, "Inject build-ids into the output stream"), - OPT_BOOLEAN(0, "buildid-all", &inject.build_id_all, + OPT_BOOLEAN(0, "buildid-all", &build_id_all, "Inject build-ids of all DSOs into the output stream"), OPT_STRING(0, "known-build-ids", &known_build_ids, "buildid path [,buildid path...]", @@ -2313,6 +2320,10 @@ int cmd_inject(int argc, const char **argv) return -1; } } + if (build_ids) + inject.build_id_style = BID_RWS__INJECT_HEADER_LAZY; + if (build_id_all) + inject.build_id_style = BID_RWS__INJECT_HEADER_ALL; data.path = inject.input_name; if (!strcmp(inject.input_name, "-") || inject.output.is_pipe) { @@ -2326,7 +2337,7 @@ int cmd_inject(int argc, const char **argv) repipe = false; } ordered_events = inject.jit_mode || inject.sched_stat || - (inject.build_ids && !inject.build_id_all); + (inject.build_id_style == BID_RWS__INJECT_HEADER_LAZY); perf_tool__init(&inject.tool, ordered_events); inject.tool.sample = perf_event__repipe_sample; inject.tool.read = perf_event__repipe_sample; @@ -2398,7 +2409,7 @@ int cmd_inject(int argc, const char **argv) goto out_delete; } - if (inject.build_ids && !inject.build_id_all) { + if (inject.build_id_style == BID_RWS__INJECT_HEADER_LAZY) { /* * to make sure the mmap records are ordered correctly * and so that the correct especially due to jitted code @@ -2406,14 +2417,14 @@ int cmd_inject(int argc, const char **argv) * inject the jit mmaps at the same time for now. */ inject.tool.ordering_requires_timestamps = true; - if (known_build_ids != NULL) { - inject.known_build_ids = - perf_inject__parse_known_build_ids(known_build_ids); + } + if (inject.build_id_style != BID_RWS__NONE && known_build_ids != NULL) { + inject.known_build_ids = + perf_inject__parse_known_build_ids(known_build_ids); - if (inject.known_build_ids == NULL) { - pr_err("Couldn't parse known build ids.\n"); - goto out_delete; - } + if (inject.known_build_ids == NULL) { + pr_err("Couldn't parse known build ids.\n"); + goto out_delete; } } -- cgit v1.2.3 From 048a7a9363a276ffc15e137ae8544d2cd7c28b67 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:37 -0700 Subject: perf inject: Combine different mmap and mmap2 functions There are repipe, build ID and JIT dump variants of the mmap and mmap2 repipe functions. The organization doesn't allow JIT dump to work with build ID injection and the structure is less than clear. Combine the function and enable the different behaviors based on ifs. 
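Reduced to a toy (the names are illustrative, not the perf API), the refactor swaps several near-duplicate callbacks chosen at setup time for one callback that branches on the configured mode:

/* Before: setup installed one of the repipe/build-ID/JIT variants.
 * After: a single handler checks the mode when the event arrives.
 */
enum inject_mode { MODE_REPIPE, MODE_JIT, MODE_BUILDID_ALL };

struct inject_state { enum inject_mode mode; };

int handle_mmap(struct inject_state *st)
{
	if (st->mode == MODE_JIT) {
		/* Consume jitdump markers, emit synthesized mmaps. */
		return 0;
	}
	if (st->mode == MODE_BUILDID_ALL) {
		/* Look up the DSO and inject its build ID. */
	}
	/* Common path: repipe the event unchanged. */
	return 0;
}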
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 199 +++++++++++++++++--------------------------- 1 file changed, 76 insertions(+), 123 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 865d16ceead2..d99868953ff2 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -377,44 +377,6 @@ static int perf_event__repipe_sample(const struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -static int perf_event__repipe_mmap(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - int err; - - err = perf_event__process_mmap(tool, event, sample, machine); - perf_event__repipe(tool, event, sample, machine); - - return err; -} - -#ifdef HAVE_JITDUMP -static int perf_event__jit_repipe_mmap(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - u64 n = 0; - int ret; - - /* - * if jit marker, then inject jit mmaps and generate ELF images - */ - ret = jit_process(inject->session, &inject->output, machine, - event->mmap.filename, event->mmap.pid, event->mmap.tid, &n); - if (ret < 0) - return ret; - if (ret) { - inject->bytes_written += n; - return 0; - } - return perf_event__repipe_mmap(tool, event, sample, machine); -} -#endif - static struct dso *findnew_dso(int pid, int tid, const char *filename, const struct dso_id *id, struct machine *machine) { @@ -460,114 +422,108 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, return dso; } -static int perf_event__repipe_buildid_mmap(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - struct dso *dso; - - dso = findnew_dso(event->mmap.pid, event->mmap.tid, - event->mmap.filename, NULL, machine); - - if (dso && !dso__hit(dso)) { - dso__set_hit(dso); - dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); - } - dso__put(dso); - - return perf_event__repipe(tool, event, sample, machine); -} - -static int perf_event__repipe_mmap2(const struct perf_tool *tool, +static int perf_event__repipe_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { - int err; + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - err = perf_event__process_mmap2(tool, event, sample, machine); - perf_event__repipe(tool, event, sample, machine); +#ifdef HAVE_JITDUMP + if (inject->jit_mode) { + u64 n = 0; + int ret; - if (event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID) { - struct dso *dso; + /* If jit marker, then inject jit mmaps and generate ELF images. 
*/ + ret = jit_process(inject->session, &inject->output, machine, + event->mmap.filename, event->mmap.pid, event->mmap.tid, &n); + if (ret < 0) + return ret; + if (ret) { + inject->bytes_written += n; + return 0; + } + } +#endif + if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { + struct dso *dso = findnew_dso(event->mmap.pid, event->mmap.tid, + event->mmap.filename, NULL, machine); - dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, - event->mmap2.filename, NULL, machine); - if (dso) { - /* mark it not to inject build-id */ + if (dso && !dso__hit(dso)) { dso__set_hit(dso); + dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); } dso__put(dso); - } + } else { + /* Create the thread, map, etc. Not done for the unordered inject all case. */ + int err = perf_event__process_mmap(tool, event, sample, machine); - return err; + if (err) + return err; + } + return perf_event__repipe(tool, event, sample, machine); } -#ifdef HAVE_JITDUMP -static int perf_event__jit_repipe_mmap2(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) +static int perf_event__repipe_mmap2(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) { struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - u64 n = 0; - int ret; + struct dso *dso = NULL; - /* - * if jit marker, then inject jit mmaps and generate ELF images - */ - ret = jit_process(inject->session, &inject->output, machine, - event->mmap2.filename, event->mmap2.pid, event->mmap2.tid, &n); - if (ret < 0) - return ret; - if (ret) { - inject->bytes_written += n; - return 0; +#ifdef HAVE_JITDUMP + if (inject->jit_mode) { + u64 n = 0; + int ret; + + /* If jit marker, then inject jit mmaps and generate ELF images. */ + ret = jit_process(inject->session, &inject->output, machine, + event->mmap2.filename, event->mmap2.pid, event->mmap2.tid, &n); + if (ret < 0) + return ret; + if (ret) { + inject->bytes_written += n; + return 0; + } } - return perf_event__repipe_mmap2(tool, event, sample, machine); -} #endif - -static int perf_event__repipe_buildid_mmap2(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - struct dso_id dso_id = { - .maj = event->mmap2.maj, - .min = event->mmap2.min, - .ino = event->mmap2.ino, - .ino_generation = event->mmap2.ino_generation, - }; - struct dso *dso; - if (event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID) { - /* cannot use dso_id since it'd have invalid info */ dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, event->mmap2.filename, NULL, machine); if (dso) { /* mark it not to inject build-id */ dso__set_hit(dso); } - dso__put(dso); - perf_event__repipe(tool, event, sample, machine); - return 0; } + if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { + if (!(event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID)) { + struct dso_id dso_id = { + .maj = event->mmap2.maj, + .min = event->mmap2.min, + .ino = event->mmap2.ino, + .ino_generation = event->mmap2.ino_generation, + }; + + dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, + event->mmap2.filename, &dso_id, machine); + } + if (dso && !dso__hit(dso)) { + dso__set_hit(dso); + dso__inject_build_id(dso, tool, machine, sample->cpumode, + event->mmap2.flags); + } + } else { + /* Create the thread, map, etc. Not done for the unordered inject all case. 
*/ + int err = perf_event__process_mmap(tool, event, sample, machine); - dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, - event->mmap2.filename, &dso_id, machine); - - if (dso && !dso__hit(dso)) { - dso__set_hit(dso); - dso__inject_build_id(dso, tool, machine, sample->cpumode, - event->mmap2.flags); + if (err) { + dso__put(dso); + return err; + } } dso__put(dso); - - perf_event__repipe(tool, event, sample, machine); - - return 0; + return perf_event__repipe(tool, event, sample, machine); } static int perf_event__repipe_fork(const struct perf_tool *tool, @@ -2032,10 +1988,7 @@ static int __cmd_inject(struct perf_inject *inject) output_data_offset = perf_session__data_offset(session->evlist); - if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { - inject->tool.mmap = perf_event__repipe_buildid_mmap; - inject->tool.mmap2 = perf_event__repipe_buildid_mmap2; - } else if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY) { + if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY) { inject->tool.sample = perf_event__inject_buildid; } else if (inject->sched_stat) { struct evsel *evsel; @@ -2430,8 +2383,8 @@ int cmd_inject(int argc, const char **argv) #ifdef HAVE_JITDUMP if (inject.jit_mode) { - inject.tool.mmap2 = perf_event__jit_repipe_mmap2; - inject.tool.mmap = perf_event__jit_repipe_mmap; + inject.tool.mmap2 = perf_event__repipe_mmap2; + inject.tool.mmap = perf_event__repipe_mmap; inject.tool.ordering_requires_timestamps = true; /* * JIT MMAP injection injects all MMAP events in one go, so it -- cgit v1.2.3 From 05c4cfeba097b986532c7a5609dcc5435c0a4dee Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:38 -0700 Subject: perf inject: Combine mmap and mmap2 handling The handling of mmap and mmap2 events is near identical. Add a common helper function and call that by the two event handling functions. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 118 +++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 62 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index d99868953ff2..a7c859db2e15 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -422,55 +422,21 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, return dso; } -static int perf_event__repipe_mmap(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - -#ifdef HAVE_JITDUMP - if (inject->jit_mode) { - u64 n = 0; - int ret; - - /* If jit marker, then inject jit mmaps and generate ELF images. 
*/ - ret = jit_process(inject->session, &inject->output, machine, - event->mmap.filename, event->mmap.pid, event->mmap.tid, &n); - if (ret < 0) - return ret; - if (ret) { - inject->bytes_written += n; - return 0; - } - } -#endif - if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { - struct dso *dso = findnew_dso(event->mmap.pid, event->mmap.tid, - event->mmap.filename, NULL, machine); - - if (dso && !dso__hit(dso)) { - dso__set_hit(dso); - dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); - } - dso__put(dso); - } else { - /* Create the thread, map, etc. Not done for the unordered inject all case. */ - int err = perf_event__process_mmap(tool, event, sample, machine); - - if (err) - return err; - } - return perf_event__repipe(tool, event, sample, machine); -} - -static int perf_event__repipe_mmap2(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) +static int perf_event__repipe_common_mmap(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine, + __u32 pid, __u32 tid, __u32 flags, + const char *filename, + const struct dso_id *dso_id, + int (*perf_event_process)(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine)) { struct perf_inject *inject = container_of(tool, struct perf_inject, tool); struct dso *dso = NULL; + bool dso_sought = false; #ifdef HAVE_JITDUMP if (inject->jit_mode) { @@ -479,7 +445,7 @@ static int perf_event__repipe_mmap2(const struct perf_tool *tool, /* If jit marker, then inject jit mmaps and generate ELF images. */ ret = jit_process(inject->session, &inject->output, machine, - event->mmap2.filename, event->mmap2.pid, event->mmap2.tid, &n); + filename, pid, tid, &n); if (ret < 0) return ret; if (ret) { @@ -489,33 +455,26 @@ static int perf_event__repipe_mmap2(const struct perf_tool *tool, } #endif if (event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID) { - dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, - event->mmap2.filename, NULL, machine); + dso = findnew_dso(pid, tid, filename, dso_id, machine); + dso_sought = true; if (dso) { /* mark it not to inject build-id */ dso__set_hit(dso); } } if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { - if (!(event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID)) { - struct dso_id dso_id = { - .maj = event->mmap2.maj, - .min = event->mmap2.min, - .ino = event->mmap2.ino, - .ino_generation = event->mmap2.ino_generation, - }; - - dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, - event->mmap2.filename, &dso_id, machine); + if (!dso_sought) { + dso = findnew_dso(pid, tid, filename, dso_id, machine); + dso_sought = true; } + if (dso && !dso__hit(dso)) { dso__set_hit(dso); - dso__inject_build_id(dso, tool, machine, sample->cpumode, - event->mmap2.flags); + dso__inject_build_id(dso, tool, machine, sample->cpumode, flags); } } else { /* Create the thread, map, etc. Not done for the unordered inject all case. 
*/ - int err = perf_event__process_mmap(tool, event, sample, machine); + int err = perf_event_process(tool, event, sample, machine); if (err) { dso__put(dso); @@ -526,6 +485,41 @@ static int perf_event__repipe_mmap2(const struct perf_tool *tool, return perf_event__repipe(tool, event, sample, machine); } +static int perf_event__repipe_mmap(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + return perf_event__repipe_common_mmap( + tool, event, sample, machine, + event->mmap.pid, event->mmap.tid, /*flags=*/0, + event->mmap.filename, /*dso_id=*/NULL, + perf_event__process_mmap); +} + +static int perf_event__repipe_mmap2(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + struct dso_id id; + struct dso_id *dso_id = NULL; + + if (!(event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID)) { + id.maj = event->mmap2.maj; + id.min = event->mmap2.min; + id.ino = event->mmap2.ino; + id.ino_generation = event->mmap2.ino_generation; + dso_id = &id; + } + + return perf_event__repipe_common_mmap( + tool, event, sample, machine, + event->mmap2.pid, event->mmap2.tid, event->mmap2.flags, + event->mmap2.filename, dso_id, + perf_event__process_mmap2); +} + static int perf_event__repipe_fork(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, -- cgit v1.2.3 From 3432bae89e044819f0b30f4c09260d2740896797 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 18 Aug 2024 14:29:47 -0700 Subject: perf record: Fix sample cgroup & namespace tracking The recent change in 'struct perf_tool' constification broke the cgroup and/or namespace tracking by resetting tool fields. It should set the values after perf_tool__init(). Fixes: cecb1cf154b301c6 ("perf record: Use perf_tool__init()") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240818212948.2873156-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 39367709fd99..adbaf80b398c 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2374,13 +2374,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGTERM, sig_handler); signal(SIGSEGV, sigsegv_handler); - if (rec->opts.record_namespaces) - tool->namespace_events = true; - if (rec->opts.record_cgroup) { -#ifdef HAVE_FILE_HANDLE - tool->cgroup_events = true; -#else +#ifndef HAVE_FILE_HANDLE pr_err("cgroup tracking is not supported\n"); return -1; #endif @@ -2406,6 +2401,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) tool->mmap2 = build_id__process_mmap2; tool->itrace_start = process_timestamp_boundary; tool->aux = process_timestamp_boundary; + tool->namespace_events = rec->opts.record_namespaces; + tool->cgroup_events = rec->opts.record_cgroup; session = perf_session__new(data, tool); if (IS_ERR(session)) { pr_err("Perf session creation failed.\n"); -- cgit v1.2.3 From 5cc698bad72667cf80097dece6b91efbb4a1a0a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 18 Aug 2024 14:29:48 -0700 Subject: perf test: Add cgroup sampling test Add it to the record.sh shell test to verify if it tracks cgroup information correctly. 
It records with the --all-cgroups option and checks that the data has PERF_RECORD_CGROUP events and that the cgroup names are not "unknown". $ sudo ./perf test -vv 95 95: perf record tests: --- start --- test child forked, pid 2871922 169c90-169cd0 g test_loop perf does have symbol 'test_loop' Basic --per-thread mode test Basic --per-thread mode test [Success] Register capture test Register capture test [Success] Basic --system-wide mode test Basic --system-wide mode test [Success] Basic target workload test Basic target workload test [Success] Branch counter test branch counter feature not supported on all core PMUs (/sys/bus/event_source/devices/cpu) [Skipped] Cgroup sampling test Cgroup sampling test [Success] ---- end(0) ---- 95: perf record tests : Ok Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240818212948.2873156-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index 36883b03169f..048078ee2eca 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -206,6 +206,28 @@ test_branch_counter() { echo "Branch counter test [Success]" } +test_cgroup() { + echo "Cgroup sampling test" + if ! perf record -aB --synth=cgroup --all-cgroups -o "${perfdata}" ${testprog} 2> /dev/null + then + echo "Cgroup sampling [Skipped not supported]" + return + fi + if ! perf report -i "${perfdata}" -D | grep -q "CGROUP" + then + echo "Cgroup sampling [Failed missing output]" + err=1 + return + fi + if ! perf script -i "${perfdata}" -F cgroup | grep -q -v "unknown" + then + echo "Cgroup sampling [Failed cannot resolve cgroup names]" + err=1 + return + fi + echo "Cgroup sampling test [Success]" +} + # raise the limit of file descriptors to minimum if [[ $default_fd_limit -lt $min_fd_limit ]]; then ulimit -Sn $min_fd_limit @@ -216,6 +238,7 @@ test_register_capture test_system_wide test_workload test_branch_counter +test_cgroup # restore the default value ulimit -Sn $default_fd_limit -- cgit v1.2.3 From 2aa93695081d4bd62350b41d58684d802be4563f Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 19 Aug 2024 16:11:27 +0100 Subject: selftests/bpf: Disable strict aliasing for verifier_nocsr.c verifier_nocsr.c fails to compile with GCC. The reason behind it was initially explained in commit 27a90b14b93d3b2e1efd10764e456af7e2a42991. "A few BPF selftests perform type punning and they may break strict aliasing rules, which are exploited by both GCC and clang by default while optimizing. This can lead to broken compiled programs."
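For reference, the kind of type punning at issue looks like this small standalone example (illustrative, not taken from the selftest): reading a float through an unsigned int pointer violates strict aliasing, so an optimizing compiler may reorder or drop the accesses unless -fno-strict-aliasing is passed.

#include <stdio.h>

/* Type punning via a pointer cast: undefined behaviour under the C
 * strict aliasing rules that GCC and clang exploit at -O2, and the
 * reason these files are built with -fno-strict-aliasing.
 */
static unsigned int float_bits(const float *f)
{
	return *(const unsigned int *)f;
}

int main(void)
{
	float one = 1.0f;

	printf("0x%08x\n", float_bits(&one));	/* 0x3f800000 on IEEE 754 */
	return 0;
}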
Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819151129.1366484-2-cupertino.miranda@oracle.com --- tools/testing/selftests/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4eceb491a8ae..7bca9a2b8bc0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -58,6 +58,7 @@ progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing progs/timer_crash.c-CFLAGS := -fno-strict-aliasing progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing +progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing ifneq ($(LLVM),) # Silence some warnings when compiled with clang -- cgit v1.2.3 From d9075ac631ce0c5c2c17aa6221282a7c4ba8fa70 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 19 Aug 2024 16:11:29 +0100 Subject: selftest/bpf: Adapt inline asm operand constraint for GCC support GCC errors when compiling tailcall_bpf2bpf_hierarchy2.c and tailcall_bpf2bpf_hierarchy3.c with the following error: progs/tailcall_bpf2bpf_hierarchy2.c: In function 'tailcall_bpf2bpf_hierarchy_2': progs/tailcall_bpf2bpf_hierarchy2.c:66:9: error: input operand constraint contains '+' 66 | asm volatile (""::"r+"(ret)); | ^~~ Changed implementation to make use of __sink macro that abstracts the desired behaviour. The proposed change seems valid for both GCC and CLANG. Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819151129.1366484-4-cupertino.miranda@oracle.com --- tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c | 4 ++-- tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c index 37604b0b97af..72fd0d577506 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -58,12 +58,12 @@ __retval(33) SEC("tc") int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) { - volatile int ret = 0; + int ret = 0; subprog_tail0(skb); subprog_tail1(skb); - asm volatile (""::"r+"(ret)); + __sink(ret); return (count1 << 16) | count0; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c index 0cdbb781fcbc..a7fb91cb05b7 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -51,11 +51,11 @@ __retval(33) SEC("tc") int tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) { - volatile int ret = 0; + int ret = 0; bpf_tail_call_static(skb, &jmp_table0, 0); - asm volatile (""::"r+"(ret)); + __sink(ret); return ret; } -- cgit v1.2.3 From 190de5449973056f416c855b11fdfee2fc18f550 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:26 +0200 Subject: selftests/bpf: Support more socket types in create_pair() Extend the function to allow creating socket pairs of SOCK_STREAM, SOCK_DGRAM and SOCK_SEQPACKET. Adapt direct callers and leave further cleanups for the following patch. 
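A sketch of the resulting usage (assuming the create_pair() definition from the sockmap_helpers.h hunk below; the include path is illustrative):

#include <sys/socket.h>
#include <unistd.h>
#include "sockmap_helpers.h"	/* assumed location of create_pair() */

/* Each call yields two connected loopback sockets and returns 0 on
 * success. SOCK_STREAM and SOCK_SEQPACKET go through listen+accept
 * internally; SOCK_DGRAM is connected in both directions instead.
 */
static int pair_demo(void)
{
	int c = -1, p = -1;

	if (create_pair(AF_INET, SOCK_STREAM, &c, &p))
		return -1;
	close(c);
	close(p);

	if (create_pair(AF_INET, SOCK_DGRAM, &c, &p))	/* newly supported */
		return -1;
	close(c);
	close(p);
	return 0;
}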
Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-1-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_basic.c | 19 +-- .../selftests/bpf/prog_tests/sockmap_helpers.h | 138 ++++++++++++++------- 2 files changed, 96 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 1337153eb0ad..5b17d69c9ee6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -451,11 +451,11 @@ out: #define MAX_EVENTS 10 static void test_sockmap_skb_verdict_shutdown(void) { + int n, err, map, verdict, c1 = -1, p1 = -1; struct epoll_event ev, events[MAX_EVENTS]; - int n, err, map, verdict, s, c1 = -1, p1 = -1; struct test_sockmap_pass_prog *skel; - int epollfd; int zero = 0; + int epollfd; char b; skel = test_sockmap_pass_prog__open_and_load(); @@ -469,10 +469,7 @@ static void test_sockmap_skb_verdict_shutdown(void) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (s < 0) - goto out; - err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); if (err < 0) goto out; @@ -570,16 +567,12 @@ out: static void test_sockmap_skb_verdict_peek_helper(int map) { - int err, s, c1, p1, zero = 0, sent, recvd, avail; + int err, c1, p1, zero = 0, sent, recvd, avail; char snd[256] = "0123456789"; char rcv[256] = "0"; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - return; - - err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); - if (!ASSERT_OK(err, "create_pairs(s)")) + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair()")) return; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index e880f97bc44d..77b73333f091 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -3,6 +3,9 @@ #include +/* include/linux/net.h */ +#define SOCK_TYPE_MASK 0xf + #define IO_TIMEOUT_SEC 30 #define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 @@ -312,54 +315,6 @@ static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); } -static inline int create_pair(int s, int family, int sotype, int *c, int *p) -{ - struct sockaddr_storage addr; - socklen_t len; - int err = 0; - - len = sizeof(addr); - err = xgetsockname(s, sockaddr(&addr), &len); - if (err) - return err; - - *c = xsocket(family, sotype, 0); - if (*c < 0) - return errno; - err = xconnect(*c, sockaddr(&addr), len); - if (err) { - err = errno; - goto close_cli0; - } - - *p = xaccept_nonblock(s, NULL, NULL); - if (*p < 0) { - err = errno; - goto close_cli0; - } - return err; -close_cli0: - close(*c); - return err; -} - -static inline int create_socket_pairs(int s, int family, int sotype, - int *c0, int *c1, int *p0, int *p1) -{ - int err; - - err = create_pair(s, family, sotype, c0, p0); - if (err) - return err; - - err = create_pair(s, family, sotype, c1, p1); - if (err) { - close(*c0); - close(*p0); - } - return err; -} - static inline int enable_reuseport(int s, 
int progfd) { int err, one = 1; @@ -412,5 +367,92 @@ static inline int socket_loopback(int family, int sotype) return socket_loopback_reuseport(family, sotype, -1); } +static inline int create_pair(int family, int sotype, int *p0, int *p1) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int s, c, p, err; + + s = socket_loopback(family, sotype); + if (s < 0) + return s; + + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_s; + + c = xsocket(family, sotype, 0); + if (c < 0) { + err = c; + goto close_s; + } + + err = connect(c, sockaddr(&addr), len); + if (err) { + if (errno != EINPROGRESS) { + FAIL_ERRNO("connect"); + goto close_c; + } + + err = poll_connect(c, IO_TIMEOUT_SEC); + if (err) { + FAIL_ERRNO("poll_connect"); + goto close_c; + } + } + + switch (sotype & SOCK_TYPE_MASK) { + case SOCK_DGRAM: + err = xgetsockname(c, sockaddr(&addr), &len); + if (err) + goto close_c; + + err = xconnect(s, sockaddr(&addr), len); + if (!err) { + *p0 = s; + *p1 = c; + return err; + } + break; + case SOCK_STREAM: + case SOCK_SEQPACKET: + p = xaccept_nonblock(s, NULL, NULL); + if (p >= 0) { + *p0 = p; + *p1 = c; + goto close_s; + } + + err = p; + break; + default: + FAIL("Unsupported socket type %#x", sotype); + err = -EOPNOTSUPP; + } + +close_c: + close(c); +close_s: + close(s); + return err; +} + +static inline int create_socket_pairs(int s, int family, int sotype, + int *c0, int *c1, int *p0, int *p1) +{ + int err; + + err = create_pair(family, sotype, c0, p0); + if (err) + return err; + + err = create_pair(family, sotype, c1, p1); + if (err) { + close(*c0); + close(*p0); + } + + return err; +} #endif // __SOCKMAP_HELPERS__ -- cgit v1.2.3 From b08f205e5b9a074ae89ad7f71002ca417a4a2700 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:27 +0200 Subject: selftests/bpf: Socket pair creation, cleanups Following create_pair() changes, remove unused function argument in create_socket_pairs() and adapt its callers, i.e. drop the open-coded loopback socket creation. 
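The new calling convention, sketched (assuming the helpers from the sockmap_helpers.h diffs; names as in the selftests):

#include <sys/socket.h>
#include "sockmap_helpers.h"	/* assumed location of the helpers */

/* No pre-created listening socket argument anymore: the helper sets
 * up its own loopback socket per pair and tears it down itself.
 */
static int pairs_demo(void)
{
	int c0, c1, p0, p1;

	if (create_socket_pairs(AF_INET, SOCK_STREAM | SOCK_NONBLOCK,
				&c0, &c1, &p0, &p1))
		return -1;
	/* ... exercise the two connected pairs, then close all four ... */
	return 0;
}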
Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-2-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_basic.c | 9 +++----- .../selftests/bpf/prog_tests/sockmap_helpers.h | 4 ++-- .../selftests/bpf/prog_tests/sockmap_listen.c | 26 +++++++--------------- 3 files changed, 13 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 5b17d69c9ee6..82bfb266741c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -503,8 +503,8 @@ out: static void test_sockmap_skb_verdict_fionread(bool pass_prog) { + int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; int expected, zero = 0, sent, recvd, avail; - int err, map, verdict, s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; struct test_sockmap_pass_prog *pass = NULL; struct test_sockmap_drop_prog *drop = NULL; char buf[256] = "0123456789"; @@ -531,11 +531,8 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - goto out; - err = create_socket_pairs(s, AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); - if (!ASSERT_OK(err, "create_socket_pairs(s)")) + err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); + if (!ASSERT_OK(err, "create_socket_pairs()")) goto out; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index 77b73333f091..ead8ea4fd0da 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -437,8 +437,8 @@ close_s: return err; } -static inline int create_socket_pairs(int s, int family, int sotype, - int *c0, int *c1, int *p0, int *p1) +static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, + int *p0, int *p1) { int err; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 9ce0e0e0b7da..bfbc217637d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -677,7 +677,7 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { const char *log_prefix = redir_mode_str(mode); - int s, c0, c1, p0, p1; + int c0, c1, p0, p1; unsigned int pass; int err, n; u32 key; @@ -685,13 +685,10 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, zero_verdict_count(verd_mapfd); - s = socket_loopback(family, sotype | SOCK_NONBLOCK); - if (s < 0) - return; - - err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1); + err = create_socket_pairs(family, sotype | SOCK_NONBLOCK, &c0, &c1, + &p0, &p1); if (err) - goto close_srv; + return; err = add_to_sockmap(sock_mapfd, p0, p1); if (err) @@ -722,8 +719,6 @@ close: xclose(c1); xclose(p0); xclose(c0); -close_srv: - xclose(s); } static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, @@ -909,7 +904,7 @@ static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *sk static void 
redir_partial(int family, int sotype, int sock_map, int parser_map) { - int s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; + int c0 = -1, c1 = -1, p0 = -1, p1 = -1; int err, n, key, value; char buf[] = "abc"; @@ -919,13 +914,10 @@ static void redir_partial(int family, int sotype, int sock_map, int parser_map) if (err) return; - s = socket_loopback(family, sotype | SOCK_NONBLOCK); - if (s < 0) - goto clean_parser_map; - - err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1); + err = create_socket_pairs(family, sotype | SOCK_NONBLOCK, &c0, &c1, + &p0, &p1); if (err) - goto close_srv; + goto clean_parser_map; err = add_to_sockmap(sock_map, p0, p1); if (err) @@ -944,8 +936,6 @@ close: xclose(p0); xclose(c1); xclose(p1); -close_srv: - xclose(s); clean_parser_map: key = 0; -- cgit v1.2.3 From 4e3dec2295b1fdf5ea5c40a8195bc13de7cffdeb Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:28 +0200 Subject: selftests/bpf: Simplify inet_socketpair() and vsock_socketpair_connectible() Replace implementation with a call to a generic function. Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-3-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_listen.c | 83 +--------------------- 1 file changed, 2 insertions(+), 81 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index bfbc217637d1..ea2faacd146d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1490,49 +1490,7 @@ static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *ma /* Returns two connected loopback vsock sockets */ static int vsock_socketpair_connectible(int sotype, int *v0, int *v1) { - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int s, p, c; - - s = socket_loopback(AF_VSOCK, sotype); - if (s < 0) - return -1; - - c = xsocket(AF_VSOCK, sotype | SOCK_NONBLOCK, 0); - if (c == -1) - goto close_srv; - - if (getsockname(s, sockaddr(&addr), &len) < 0) - goto close_cli; - - if (connect(c, sockaddr(&addr), len) < 0 && errno != EINPROGRESS) { - FAIL_ERRNO("connect"); - goto close_cli; - } - - len = sizeof(addr); - p = accept_timeout(s, sockaddr(&addr), &len, IO_TIMEOUT_SEC); - if (p < 0) - goto close_cli; - - if (poll_connect(c, IO_TIMEOUT_SEC) < 0) { - FAIL_ERRNO("poll_connect"); - goto close_acc; - } - - *v0 = p; - *v1 = c; - - return 0; - -close_acc: - close(p); -close_cli: - close(c); -close_srv: - close(s); - - return -1; + return create_pair(AF_VSOCK, sotype | SOCK_NONBLOCK, v0, v1); } static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd, @@ -1681,44 +1639,7 @@ static void test_reuseport(struct test_sockmap_listen *skel, static int inet_socketpair(int family, int type, int *s, int *c) { - struct sockaddr_storage addr; - socklen_t len; - int p0, c0; - int err; - - p0 = socket_loopback(family, type | SOCK_NONBLOCK); - if (p0 < 0) - return p0; - - len = sizeof(addr); - err = xgetsockname(p0, sockaddr(&addr), &len); - if (err) - goto close_peer0; - - c0 = xsocket(family, type | SOCK_NONBLOCK, 0); - if (c0 < 0) { - err = c0; - goto close_peer0; - } - err = xconnect(c0, sockaddr(&addr), len); - if (err) - goto close_cli0; - err = xgetsockname(c0, sockaddr(&addr), &len); - if (err) - goto close_cli0; - err = xconnect(p0, sockaddr(&addr), 
len); - if (err) - goto close_cli0; - - *s = p0; - *c = c0; - return 0; - -close_cli0: - xclose(c0); -close_peer0: - xclose(p0); - return err; + return create_pair(family, type | SOCK_NONBLOCK, s, c); } static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, -- cgit v1.2.3 From b3b15b7a1e8de773c82f81c97904b51d4e919faa Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:29 +0200 Subject: selftests/bpf: Honour the sotype of af_unix redir tests Do actually test the sotype as specified by the caller. This picks up after commit 75e0e27db6cf ("selftest/bpf: Change udp to inet in some function names"). Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-4-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index ea2faacd146d..7ed223df5f12 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1706,11 +1706,11 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd, int sfd[2]; int err; - if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd)) + if (socketpair(AF_UNIX, type | SOCK_NONBLOCK, 0, sfd)) return; c0 = sfd[0], p0 = sfd[1]; - err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1); + err = inet_socketpair(family, type, &p1, &c1); if (err) goto close; @@ -1758,7 +1758,7 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, int sfd[2]; int err; - err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0); + err = inet_socketpair(family, type, &p0, &c0); if (err) return; -- cgit v1.2.3 From c9c70b28face7b156960f53649bec9ace5601b85 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:30 +0200 Subject: selftests/bpf: Exercise SOCK_STREAM unix_inet_redir_to_connected() Constants got switched reducing the test's coverage. Replace SOCK_DGRAM with SOCK_STREAM in one of unix_inet_skb_redir_to_connected() tests. 
Fixes: 51354f700d40 ("bpf, sockmap: Add af_unix test with both sockets in map") Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-5-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 7ed223df5f12..da5a6fb03b69 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1793,7 +1793,7 @@ static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel, unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, -1, verdict_map, REDIR_EGRESS, NO_FLAGS); - unix_inet_redir_to_connected(family, SOCK_DGRAM, + unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, -1, verdict_map, REDIR_EGRESS, NO_FLAGS); -- cgit v1.2.3 From 86149b4f5a2d535279096946b6ef8d41911a9b5a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:31 +0200 Subject: selftests/bpf: Introduce __attribute__((cleanup)) in create_pair() Rewrite function to have (unneeded) socket descriptors automatically close()d when leaving the scope. Make sure the "ownership" of fds is correctly passed via take_fd(); i.e. descriptor returned to caller will remain valid. Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-6-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_helpers.h | 61 +++++++++++++--------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index ead8ea4fd0da..38e35c72bdaa 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -17,6 +17,17 @@ #define __always_unused __attribute__((__unused__)) +/* include/linux/cleanup.h */ +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) + +#define take_fd(fd) __get_and_null(fd, -EBADF) + #define _FAIL(errnum, fmt...) 
\ ({ \ error_at_line(0, (errnum), __func__, __LINE__, fmt); \ @@ -182,6 +193,14 @@ __ret; \ }) +static inline void close_fd(int *fd) +{ + if (*fd >= 0) + xclose(*fd); +} + +#define __close_fd __attribute__((cleanup(close_fd))) + static inline int poll_connect(int fd, unsigned int timeout_sec) { struct timeval timeout = { .tv_sec = timeout_sec }; @@ -369,9 +388,10 @@ static inline int socket_loopback(int family, int sotype) static inline int create_pair(int family, int sotype, int *p0, int *p1) { + __close_fd int s, c = -1, p = -1; struct sockaddr_storage addr; socklen_t len = sizeof(addr); - int s, c, p, err; + int err; s = socket_loopback(family, sotype); if (s < 0) @@ -379,25 +399,23 @@ static inline int create_pair(int family, int sotype, int *p0, int *p1) err = xgetsockname(s, sockaddr(&addr), &len); if (err) - goto close_s; + return err; c = xsocket(family, sotype, 0); - if (c < 0) { - err = c; - goto close_s; - } + if (c < 0) + return c; err = connect(c, sockaddr(&addr), len); if (err) { if (errno != EINPROGRESS) { FAIL_ERRNO("connect"); - goto close_c; + return err; } err = poll_connect(c, IO_TIMEOUT_SEC); if (err) { FAIL_ERRNO("poll_connect"); - goto close_c; + return err; } } @@ -405,36 +423,29 @@ static inline int create_pair(int family, int sotype, int *p0, int *p1) case SOCK_DGRAM: err = xgetsockname(c, sockaddr(&addr), &len); if (err) - goto close_c; + return err; err = xconnect(s, sockaddr(&addr), len); - if (!err) { - *p0 = s; - *p1 = c; + if (err) return err; - } + + *p0 = take_fd(s); break; case SOCK_STREAM: case SOCK_SEQPACKET: p = xaccept_nonblock(s, NULL, NULL); - if (p >= 0) { - *p0 = p; - *p1 = c; - goto close_s; - } + if (p < 0) + return p; - err = p; + *p0 = take_fd(p); break; default: FAIL("Unsupported socket type %#x", sotype); - err = -EOPNOTSUPP; + return -EOPNOTSUPP; } -close_c: - close(c); -close_s: - close(s); - return err; + *p1 = take_fd(c); + return 0; } static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, -- cgit v1.2.3 From 6236ebe07131a7746d870f1d8eb3637a8df13e70 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 19 Aug 2024 16:46:29 -0300 Subject: perf daemon: Fix the build on more 32-bit architectures The previous attempt fixed the build on debian:experimental-x-mipsel, but when building on a larger set of containers I noticed it broke the build on some other 32-bit architectures such as: 42 7.87 ubuntu:18.04-x-arm : FAIL gcc version 7.5.0 (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) builtin-daemon.c: In function 'cmd_session_list': builtin-daemon.c:692:16: error: format '%llu' expects argument of type 'long long unsigned int', but argument 4 has type 'long int' [-Werror=format=] fprintf(out, "%c%" PRIu64, ^~~~~ builtin-daemon.c:694:13: csv_sep, (curr - daemon->start) / 60); ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from builtin-daemon.c:3:0: /usr/arm-linux-gnueabihf/include/inttypes.h:105:34: note: format string is defined here # define PRIu64 __PRI64_PREFIX "u" So lets cast that time_t (32-bit/64-bit) to uint64_t to make sure it builds everywhere. 
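The same pattern as a standalone sketch (illustrative only, not part of the patch):

```
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	time_t start = time(NULL) - 3600;	/* pretend we started an hour ago */
	time_t curr = time(NULL);

	/* time_t is 32-bit on some architectures and 64-bit on others;
	 * the uint64_t cast makes the argument always match %PRIu64.
	 */
	printf("up: %" PRIu64 " minutes\n",
	       (uint64_t)((curr - start) / 60));
	return 0;
}
```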
Fixes: 4bbe6002931954bb ("perf daemon: Fix the build on 32-bit architectures") Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZsPmldtJ0D9Cua9_@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index 7dca39c4314b..f0568431fbd5 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -691,7 +691,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - daemon->start) / 60); + csv_sep, (uint64_t)((curr - daemon->start) / 60)); fprintf(out, "\n"); } else { @@ -702,7 +702,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, " lock: %s/lock\n", daemon->base); fprintf(out, " up: %" PRIu64 " minutes\n", - (curr - daemon->start) / 60); + (uint64_t)((curr - daemon->start) / 60)); } } @@ -730,7 +730,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - session->start) / 60); + csv_sep, (uint64_t)((curr - session->start) / 60)); fprintf(out, "\n"); } else { @@ -747,7 +747,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, " ack: %s/%s\n", session->base, SESSION_ACK); fprintf(out, " up: %" PRIu64 " minutes\n", - (curr - session->start) / 60); + (uint64_t)((curr - session->start) / 60)); } } -- cgit v1.2.3 From 0311507792b54069ac72e0a6c6b35c5d40aadad8 Mon Sep 17 00:00:00 2001 From: Deven Bowers Date: Fri, 2 Aug 2024 23:08:15 -0700 Subject: lsm: add IPE lsm Integrity Policy Enforcement (IPE) is an LSM that provides a complementary approach to Mandatory Access Control relative to the existing LSMs. Existing LSMs have centered around the concept that access to a resource should be controlled by the current user's credentials. IPE's approach is that access to a resource should be controlled by the system's trust in the resource itself. The basis of this approach is defining a global policy that specifies which resources can be trusted.
Signed-off-by: Deven Bowers Signed-off-by: Fan Wu [PM: subject line tweak] Signed-off-by: Paul Moore --- include/uapi/linux/lsm.h | 1 + security/Kconfig | 11 +++--- security/Makefile | 1 + security/ipe/Kconfig | 17 +++++++++ security/ipe/Makefile | 9 +++++ security/ipe/ipe.c | 42 ++++++++++++++++++++++ security/ipe/ipe.h | 16 +++++++++ security/security.c | 3 +- .../testing/selftests/lsm/lsm_list_modules_test.c | 3 ++ 9 files changed, 97 insertions(+), 6 deletions(-) create mode 100644 security/ipe/Kconfig create mode 100644 security/ipe/Makefile create mode 100644 security/ipe/ipe.c create mode 100644 security/ipe/ipe.h (limited to 'tools') diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h index 33d8c9f4aa6b..938593dfd5da 100644 --- a/include/uapi/linux/lsm.h +++ b/include/uapi/linux/lsm.h @@ -64,6 +64,7 @@ struct lsm_ctx { #define LSM_ID_LANDLOCK 110 #define LSM_ID_IMA 111 #define LSM_ID_EVM 112 +#define LSM_ID_IPE 113 /* * LSM_ATTR_XXX definitions identify different LSM attributes diff --git a/security/Kconfig b/security/Kconfig index 412e76f1575d..9fb8f9b14972 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -192,6 +192,7 @@ source "security/yama/Kconfig" source "security/safesetid/Kconfig" source "security/lockdown/Kconfig" source "security/landlock/Kconfig" +source "security/ipe/Kconfig" source "security/integrity/Kconfig" @@ -231,11 +232,11 @@ endchoice config LSM string "Ordered list of enabled LSMs" - default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK - default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR - default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO - default "landlock,lockdown,yama,loadpin,safesetid,bpf" if DEFAULT_SECURITY_DAC - default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf" + default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,ipe,bpf" if DEFAULT_SECURITY_SMACK + default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,ipe,bpf" if DEFAULT_SECURITY_APPARMOR + default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,ipe,bpf" if DEFAULT_SECURITY_TOMOYO + default "landlock,lockdown,yama,loadpin,safesetid,ipe,bpf" if DEFAULT_SECURITY_DAC + default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,ipe,bpf" help A comma-separated list of LSMs, in initialization order. 
Any LSMs left off this list, except for those with order diff --git a/security/Makefile b/security/Makefile index 59f238490665..cc0982214b84 100644 --- a/security/Makefile +++ b/security/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown/ obj-$(CONFIG_CGROUPS) += device_cgroup.o obj-$(CONFIG_BPF_LSM) += bpf/ obj-$(CONFIG_SECURITY_LANDLOCK) += landlock/ +obj-$(CONFIG_SECURITY_IPE) += ipe/ # Object integrity file lists obj-$(CONFIG_INTEGRITY) += integrity/ diff --git a/security/ipe/Kconfig b/security/ipe/Kconfig new file mode 100644 index 000000000000..e4875fb04883 --- /dev/null +++ b/security/ipe/Kconfig @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Integrity Policy Enforcement (IPE) configuration +# + +menuconfig SECURITY_IPE + bool "Integrity Policy Enforcement (IPE)" + depends on SECURITY && SECURITYFS + select PKCS7_MESSAGE_PARSER + select SYSTEM_DATA_VERIFICATION + help + This option enables the Integrity Policy Enforcement LSM + allowing users to define a policy to enforce a trust-based access + control. A key feature of IPE is a customizable policy to allow + admins to reconfigure trust requirements on the fly. + + If unsure, answer N. diff --git a/security/ipe/Makefile b/security/ipe/Makefile new file mode 100644 index 000000000000..5486398a69e9 --- /dev/null +++ b/security/ipe/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2020-2024 Microsoft Corporation. All rights reserved. +# +# Makefile for building the IPE module as part of the kernel tree. +# + +obj-$(CONFIG_SECURITY_IPE) += \ + ipe.o \ diff --git a/security/ipe/ipe.c b/security/ipe/ipe.c new file mode 100644 index 000000000000..8d4ea372873e --- /dev/null +++ b/security/ipe/ipe.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2024 Microsoft Corporation. All rights reserved. + */ +#include + +#include "ipe.h" + +static struct lsm_blob_sizes ipe_blobs __ro_after_init = { +}; + +static const struct lsm_id ipe_lsmid = { + .name = "ipe", + .id = LSM_ID_IPE, +}; + +static struct security_hook_list ipe_hooks[] __ro_after_init = { +}; + +/** + * ipe_init() - Entry point of IPE. + * + * This is called at LSM init, which happens occurs early during kernel + * start up. During this phase, IPE registers its hooks and loads the + * builtin boot policy. + * + * Return: + * * %0 - OK + * * %-ENOMEM - Out of memory (OOM) + */ +static int __init ipe_init(void) +{ + security_add_hooks(ipe_hooks, ARRAY_SIZE(ipe_hooks), &ipe_lsmid); + + return 0; +} + +DEFINE_LSM(ipe) = { + .name = "ipe", + .init = ipe_init, + .blobs = &ipe_blobs, +}; diff --git a/security/ipe/ipe.h b/security/ipe/ipe.h new file mode 100644 index 000000000000..adc3c45e9f53 --- /dev/null +++ b/security/ipe/ipe.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2024 Microsoft Corporation. All rights reserved. + */ + +#ifndef _IPE_H +#define _IPE_H + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "ipe: " fmt + +#include + +#endif /* _IPE_H */ diff --git a/security/security.c b/security/security.c index 611d3c124ba6..645a660320cb 100644 --- a/security/security.c +++ b/security/security.c @@ -53,7 +53,8 @@ (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_LANDLOCK) ? 1 : 0) + \ (IS_ENABLED(CONFIG_IMA) ? 1 : 0) + \ - (IS_ENABLED(CONFIG_EVM) ? 1 : 0)) + (IS_ENABLED(CONFIG_EVM) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_IPE) ? 
1 : 0)) /* * These are descriptions of the reasons that can be passed to the diff --git a/tools/testing/selftests/lsm/lsm_list_modules_test.c b/tools/testing/selftests/lsm/lsm_list_modules_test.c index 06d24d4679a6..1cc8a977c711 100644 --- a/tools/testing/selftests/lsm/lsm_list_modules_test.c +++ b/tools/testing/selftests/lsm/lsm_list_modules_test.c @@ -128,6 +128,9 @@ TEST(correct_lsm_list_modules) case LSM_ID_EVM: name = "evm"; break; + case LSM_ID_IPE: + name = "ipe"; + break; default: name = "INVALID"; break; -- cgit v1.2.3 From 8b1042c425f6a5a9fb57c5e1d5ecf5247cf899a6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 15 Aug 2024 15:38:23 -0700 Subject: perf annotate-data: Set bitfield member offset and size properly The bitfield members might not have DW_AT_data_member_location. Let's use DW_AT_data_bit_offset to set the member offset correct. Also use DW_AT_bit_size for the name like in a C program. Before: Annotate type: 'struct sk_buff' (1 samples) Percent Offset Size Field - 100.00 0 232 struct sk_buff { + 0.00 0 24 union ; + 0.00 24 8 union ; + 0.00 32 8 union ; 0.00 40 48 char[] cb; + 0.00 88 16 union ; 0.00 104 8 long unsigned int _nfct; 100.00 112 4 unsigned int len; 0.00 116 4 unsigned int data_len; 0.00 120 2 __u16 mac_len; 0.00 122 2 __u16 hdr_len; 0.00 124 2 __u16 queue_mapping; 0.00 126 0 __u8[] __cloned_offset; 0.00 0 1 __u8 cloned; 0.00 0 1 __u8 nohdr; 0.00 0 1 __u8 fclone; 0.00 0 1 __u8 peeked; 0.00 0 1 __u8 head_frag; 0.00 0 1 __u8 pfmemalloc; 0.00 0 1 __u8 pp_recycle; 0.00 127 1 __u8 active_extensions; + 0.00 128 60 union ; 0.00 188 4 sk_buff_data_t tail; 0.00 192 4 sk_buff_data_t end; 0.00 200 8 unsigned char* head; After: Annotate type: 'struct sk_buff' (1 samples) Percent Offset Size Field - 100.00 0 232 struct sk_buff { + 0.00 0 24 union ; + 0.00 24 8 union ; + 0.00 32 8 union ; 0.00 40 48 char[] cb + 0.00 88 16 union ; 0.00 104 8 long unsigned int _nfct; 100.00 112 4 unsigned int len; 0.00 116 4 unsigned int data_len; 0.00 120 2 __u16 mac_len; 0.00 122 2 __u16 hdr_len; 0.00 124 2 __u16 queue_mapping; 0.00 126 0 __u8[] __cloned_offset; 0.00 126 1 __u8 cloned:1; 0.00 126 1 __u8 nohdr:1; 0.00 126 1 __u8 fclone:2; 0.00 126 1 __u8 peeked:1; 0.00 126 1 __u8 head_frag:1; 0.00 126 1 __u8 pfmemalloc:1; 0.00 126 1 __u8 pp_recycle:1; 0.00 127 1 __u8 active_extensions; + 0.00 128 60 union ; 0.00 188 4 sk_buff_data_t tail; 0.00 192 4 sk_buff_data_t end; 0.00 200 8 unsigned char* head; Commiter notes: Collect some data: root@number:~# perf mem record -a --ldlat 5 -- ping -s 8193 -f 192.168.86.1 Memory events are enabled on a subset of CPUs: 16-27 PING 192.168.86.1 (192.168.86.1) 8193(8221) bytes of data. 
.^C --- 192.168.86.1 ping statistics --- 13881 packets transmitted, 13880 received, 0.00720409% packet loss, time 8664ms rtt min/avg/max/mdev = 0.510/0.599/7.768/0.115 ms, ipg/ewma 0.624/0.593 ms [ perf record: Woken up 8 times to write data ] [ perf record: Captured and wrote 14.877 MB perf.data (46785 samples) ] root@number:~# root@number:~# perf evlist cpu_atom/mem-loads,ldlat=5/P cpu_atom/mem-stores/P dummy:u root@number:~# perf evlist -v cpu_atom/mem-loads,ldlat=5/P: type: 10 (cpu_atom), size: 136, config: 0x5d0 (mem-loads), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1, { bp_addr, config1 }: 0x7 cpu_atom/mem-stores/P: type: 10 (cpu_atom), size: 136, config: 0x6d0 (mem-stores), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1 dummy:u: type: 1 (software), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|ADDR|CPU|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 root@number:~# Ok, now lets see what changes from before this patch to after it: root@number:~# perf annotate --data-type > /tmp/before Apply the patch, build: root@number:~# perf annotate --data-type > /tmp/after The first hunk of the diff, for a glib data structure, in userspace, look at those bitfields: root@number:~# diff -u10 /tmp/before /tmp/after | head -20 --- /tmp/before 2024-08-20 17:29:58.306765780 -0300 +++ /tmp/after 2024-08-20 17:33:13.210582596 -0300 @@ -163,22 +163,22 @@ Annotate type: 'GHashTable' in /usr/lib64/libglib-2.0.so.0.8000.3 (1 samples): ============================================================================ Percent offset size field 100.00 0 96 GHashTable { 0.00 0 8 gsize size; 0.00 8 4 gint mod; 100.00 12 4 guint mask; 0.00 16 4 guint nnodes; 0.00 20 4 guint noccupied; - 0.00 0 4 guint have_big_keys; - 0.00 0 4 guint have_big_values; + 0.00 24 1 guint have_big_keys:1; + 0.00 24 1 guint have_big_values:1; 0.00 32 8 gpointer keys; 0.00 40 8 guint* hashes; 0.00 48 8 gpointer values; root@number:~# As advertised :-) Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240815223823.2402285-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index aa330c7d8edd..e5589268cb42 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -202,7 +202,7 @@ static int __add_member_cb(Dwarf_Die *die, void *arg) struct annotated_member *parent = arg; struct annotated_member *member; Dwarf_Die member_type, die_mem; - Dwarf_Word size, loc; + Dwarf_Word size, loc, bit_size = 0; Dwarf_Attribute attr; struct strbuf sb; int tag; @@ -226,15 +226,37 @@ static int __add_member_cb(Dwarf_Die *die, void *arg) if (dwarf_aggregate_size(&die_mem, &size) < 0) size = 0; - if 
(!dwarf_attr_integrate(die, DW_AT_data_member_location, &attr)) - loc = 0; - else + if (dwarf_attr_integrate(die, DW_AT_data_member_location, &attr)) dwarf_formudata(&attr, &loc); + else { + /* bitfield member */ + if (dwarf_attr_integrate(die, DW_AT_data_bit_offset, &attr) && + dwarf_formudata(&attr, &loc) == 0) + loc /= 8; + else + loc = 0; + + if (dwarf_attr_integrate(die, DW_AT_bit_size, &attr) && + dwarf_formudata(&attr, &bit_size) == 0) + size = (bit_size + 7) / 8; + } member->type_name = strbuf_detach(&sb, NULL); /* member->var_name can be NULL */ - if (dwarf_diename(die)) - member->var_name = strdup(dwarf_diename(die)); + if (dwarf_diename(die)) { + if (bit_size) { + if (asprintf(&member->var_name, "%s:%ld", + dwarf_diename(die), (long)bit_size) < 0) + member->var_name = NULL; + } else { + member->var_name = strdup(dwarf_diename(die)); + } + + if (member->var_name == NULL) { + free(member); + return DIE_FIND_CB_END; + } + } member->size = size; member->offset = loc + parent->offset; INIT_LIST_HEAD(&member->children); -- cgit v1.2.3 From e25ebda78e230283bf707ae3e9655270ff40a7f9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 6 Aug 2024 15:06:14 -0700 Subject: perf cap: Tidy up and improve capability testing Remove the dependence on libcap. libcap was only used to query whether a capability is supported, which is just one capget system call. If the capget system call fails, fall back on root permission checking. Previously, if libcap failed, the permission was assumed to be absent, which may be pessimistic/wrong. Add a used_root out argument to perf_cap__capable to say whether the fallback root check was used. This allows the correct error message, "root" vs "users with the CAP_PERFMON or CAP_SYS_ADMIN capability", to be selected. Tidy up uses of perf_cap__capable so that tests aren't repeated if capget isn't supported.
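For reference, a minimal standalone sketch of the capget(2) query that replaces the libcap dependency (simplified: no _LINUX_CAPABILITY_VERSION_1 retry, constants from the Linux UAPI header):

```
#include <linux/capability.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef CAP_PERFMON
#define CAP_PERFMON 38			/* for older UAPI headers */
#endif

int main(void)
{
	struct __user_cap_header_struct header = {
		.version = _LINUX_CAPABILITY_VERSION_3,
		.pid = getpid(),
	};
	struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

	if (syscall(SYS_capget, &header, data) == -1) {
		perror("capget");	/* a caller would fall back on geteuid() == 0 */
		return 1;
	}

	/* Capability 38 lives in the second 32-bit word of the v3 set. */
	printf("CAP_PERFMON is %s\n",
	       (data[CAP_PERFMON / 32].effective & (1u << (CAP_PERFMON % 32))) ?
	       "effective" : "not effective");
	return 0;
}
```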
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Changbin Du Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240806220614.831914-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 11 -------- tools/perf/builtin-ftrace.c | 28 ++++++++++---------- tools/perf/util/Build | 2 +- tools/perf/util/cap.c | 63 +++++++++++++++++++++++++++++++-------------- tools/perf/util/cap.h | 23 +++-------------- tools/perf/util/symbol.c | 8 +++--- tools/perf/util/util.c | 12 ++++++--- 7 files changed, 75 insertions(+), 72 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index fa679db61f62..4eb1fc897baf 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1031,17 +1031,6 @@ ifndef NO_LIBZSTD endif endif -ifndef NO_LIBCAP - ifeq ($(feature-libcap), 1) - CFLAGS += -DHAVE_LIBCAP_SUPPORT - EXTLIBS += -lcap - $(call detected,CONFIG_LIBCAP) - else - $(warning No libcap found, disables capability support, please install libcap-devel/libcap-dev) - NO_LIBCAP := 1 - endif -endif - ifndef NO_BACKTRACE ifeq ($(feature-backtrace), 1) CFLAGS += -DHAVE_BACKTRACE_SUPPORT diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index a615c405d98f..88a87bf387d2 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -63,20 +63,21 @@ static void ftrace__workload_exec_failed_signal(int signo __maybe_unused, done = true; } -static int check_ftrace_capable(void) +static bool check_ftrace_capable(void) { - if (!(perf_cap__capable(CAP_PERFMON) || - perf_cap__capable(CAP_SYS_ADMIN))) { - pr_err("ftrace only works for %s!\n", -#ifdef HAVE_LIBCAP_SUPPORT - "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" -#else - "root" -#endif + bool used_root; + + if (perf_cap__capable(CAP_PERFMON, &used_root)) + return true; + + if (!used_root && perf_cap__capable(CAP_SYS_ADMIN, &used_root)) + return true; + + pr_err("ftrace only works for %s!\n", + used_root ? 
"root" + : "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" ); - return -1; - } - return 0; + return false; } static int __write_tracing_file(const char *name, const char *val, bool append) @@ -1579,8 +1580,7 @@ int cmd_ftrace(int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGPIPE, sig_handler); - ret = check_ftrace_capable(); - if (ret < 0) + if (!check_ftrace_capable()) return -1; ret = perf_config(perf_ftrace_config, &ftrace); diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 7ea261416c14..b87f918bdfe7 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -223,7 +223,7 @@ perf-util-$(CONFIG_ZLIB) += zlib.o perf-util-$(CONFIG_LZMA) += lzma.o perf-util-$(CONFIG_ZSTD) += zstd.o -perf-util-$(CONFIG_LIBCAP) += cap.o +perf-util-y += cap.o perf-util-$(CONFIG_CXX_DEMANGLE) += demangle-cxx.o perf-util-y += demangle-ocaml.o diff --git a/tools/perf/util/cap.c b/tools/perf/util/cap.c index c3ba841bbf37..7574a67651bc 100644 --- a/tools/perf/util/cap.c +++ b/tools/perf/util/cap.c @@ -3,27 +3,52 @@ * Capability utilities */ -#ifdef HAVE_LIBCAP_SUPPORT - #include "cap.h" -#include -#include - -bool perf_cap__capable(cap_value_t cap) -{ - cap_flag_value_t val; - cap_t caps = cap_get_proc(); +#include "debug.h" +#include +#include +#include +#include +#include - if (!caps) - return false; +#ifndef SYS_capget +#define SYS_capget 90 +#endif - if (cap_get_flag(caps, cap, CAP_EFFECTIVE, &val) != 0) - val = CAP_CLEAR; +#define MAX_LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 - if (cap_free(caps) != 0) - return false; - - return val == CAP_SET; +bool perf_cap__capable(int cap, bool *used_root) +{ + struct __user_cap_header_struct header = { + .version = _LINUX_CAPABILITY_VERSION_3, + .pid = getpid(), + }; + struct __user_cap_data_struct data[MAX_LINUX_CAPABILITY_U32S]; + __u32 cap_val; + + *used_root = false; + while (syscall(SYS_capget, &header, &data[0]) == -1) { + /* Retry, first attempt has set the header.version correctly. */ + if (errno == EINVAL && header.version != _LINUX_CAPABILITY_VERSION_3 && + header.version == _LINUX_CAPABILITY_VERSION_1) + continue; + + pr_debug2("capget syscall failed (%s - %d) fall back on root check\n", + strerror(errno), errno); + *used_root = true; + return geteuid() == 0; + } + + /* Extract the relevant capability bit. */ + if (cap >= 32) { + if (header.version == _LINUX_CAPABILITY_VERSION_3) { + cap_val = data[1].effective; + } else { + /* Capability beyond 32 is requested but only 32 are supported. */ + return false; + } + } else { + cap_val = data[0].effective; + } + return (cap_val & (1 << (cap & 0x1f))) != 0; } - -#endif /* HAVE_LIBCAP_SUPPORT */ diff --git a/tools/perf/util/cap.h b/tools/perf/util/cap.h index ae52878c0b2e..0c6a1ff55f07 100644 --- a/tools/perf/util/cap.h +++ b/tools/perf/util/cap.h @@ -3,26 +3,6 @@ #define __PERF_CAP_H #include -#include -#include - -#ifdef HAVE_LIBCAP_SUPPORT - -#include - -bool perf_cap__capable(cap_value_t cap); - -#else - -#include -#include - -static inline bool perf_cap__capable(int cap __maybe_unused) -{ - return geteuid() == 0; -} - -#endif /* HAVE_LIBCAP_SUPPORT */ /* For older systems */ #ifndef CAP_SYSLOG @@ -33,4 +13,7 @@ static inline bool perf_cap__capable(int cap __maybe_unused) #define CAP_PERFMON 38 #endif +/* Query if a capability is supported, used_root is set if the fallback root check was used. 
*/ +bool perf_cap__capable(int cap, bool *used_root); + #endif /* __PERF_CAP_H */ diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 19eb623e0826..a18927d792af 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -2425,14 +2425,14 @@ static bool symbol__read_kptr_restrict(void) { bool value = false; FILE *fp = fopen("/proc/sys/kernel/kptr_restrict", "r"); + bool used_root; + bool cap_syslog = perf_cap__capable(CAP_SYSLOG, &used_root); if (fp != NULL) { char line[8]; if (fgets(line, sizeof(line), fp) != NULL) - value = perf_cap__capable(CAP_SYSLOG) ? - (atoi(line) >= 2) : - (atoi(line) != 0); + value = cap_syslog ? (atoi(line) >= 2) : (atoi(line) != 0); fclose(fp); } @@ -2440,7 +2440,7 @@ static bool symbol__read_kptr_restrict(void) /* Per kernel/kallsyms.c: * we also restrict when perf_event_paranoid > 1 w/o CAP_SYSLOG */ - if (perf_event_paranoid() > 1 && !perf_cap__capable(CAP_SYSLOG)) + if (perf_event_paranoid() > 1 && !cap_syslog) value = true; return value; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 4f561e5e4162..9d55a13787ce 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -325,9 +325,15 @@ int perf_event_paranoid(void) bool perf_event_paranoid_check(int max_level) { - return perf_cap__capable(CAP_SYS_ADMIN) || - perf_cap__capable(CAP_PERFMON) || - perf_event_paranoid() <= max_level; + bool used_root; + + if (perf_cap__capable(CAP_SYS_ADMIN, &used_root)) + return true; + + if (!used_root && perf_cap__capable(CAP_PERFMON, &used_root)) + return true; + + return perf_event_paranoid() <= max_level; } static int -- cgit v1.2.3 From 555e5531635a7c5dba663d06cea240362624dfde Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 17 Aug 2024 13:36:59 -0700 Subject: selftests: net/forwarding: spawn sh inside vrf to speed up ping loop Looking at timestamped output of netdev CI reveals that most of the time in forwarding tests for custom route hashing is spent on a single case, namely the test which uses ping (mausezahn does not support flow labels). On a non-debug kernel we spend 714 of 730 total test runtime (97%) on this test case. While having flow label support in a traffic gen tool / mausezahn would be best, we can significantly speed up the loop by putting ip vrf exec outside of the iteration. In a test of 1000 pings using a normal loop takes 50 seconds to finish. While using: ip vrf exec $vrf sh -c "$loop-body" takes 12 seconds (1/4 of the time). Some of the slowness is likely due to our inefficient virtualization setup, but even on my laptop running "ip link help" 16k times takes 25-30 seconds, so I think it's worth optimizing even for fastest setups. 
Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Hangbin Liu Link: https://patch.msgid.link/20240817203659.712085-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/custom_multipath_hash.sh | 8 ++++---- .../testing/selftests/net/forwarding/gre_custom_multipath_hash.sh | 8 ++++---- .../selftests/net/forwarding/ip6gre_custom_multipath_hash.sh | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh index 1783c10215e5..7d531f7091e6 100755 --- a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh @@ -224,10 +224,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label. - for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() diff --git a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh index 9788bd0f6e8b..dda11a4a9450 100755 --- a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh @@ -319,10 +319,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label. - for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() diff --git a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh index 2ab9eaaa5532..e28b4a079e52 100755 --- a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh @@ -321,10 +321,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label. - for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() -- cgit v1.2.3 From 99338cc1e47187c3efdd39cda2a31500d0f2305d Mon Sep 17 00:00:00 2001 From: Piotr Zalewski Date: Mon, 19 Aug 2024 17:58:34 +0000 Subject: kselftest: timers: Fix const correctness Make timespec pointers, pointers to const in checklist function. As a consequence, make list parameter in checklist function pointer to const as well. Const-correctness increases readability. 
The improvement was found by running the cppcheck tool on the patched file as follows: ``` cppcheck --enable=all \ tools/testing/selftests/timers/threadtest.c \ --suppress=missingIncludeSystem \ --suppress=unusedFunction ``` Reviewed-by: Shuah Khan Signed-off-by: Piotr Zalewski Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/threadtest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index 76b38e41d9c7..d5564bbf0e50 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -38,10 +38,10 @@ struct timespec global_list[LISTSIZE]; int listcount = 0; -void checklist(struct timespec *list, int size) +void checklist(const struct timespec *list, int size) { int i, j; - struct timespec *a, *b; + const struct timespec *a, *b; /* scan the list */ for (i = 0; i < size-1; i++) { -- cgit v1.2.3 From c049acee3c71cfc26c739f82617a84e13e471a45 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 15 May 2024 01:36:20 -0400 Subject: selftests/ftrace: Fix test to handle both old and new kernels The function "scheduler_tick" was renamed to "sched_tick", and a selftest that exercises function trace filtering used that function as part of the test. The rename causes the test to fail when run on older kernels. As tests should not fail on older kernels, add a check to see which name is available before testing. Fixes: 86dd6c04ef9f ("sched/balancing: Rename scheduler_tick() => sched_tick()") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 073a748b9380..263f6b798c85 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,14 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="sched_tick" +if grep '^sched_tick\b' available_filter_functions; then + FUNC2="sched_tick" +elif grep '^scheduler_tick\b' available_filter_functions; then + FUNC2="scheduler_tick" +else + exit_unresolved +fi + ALL_FUNCS="#### all functions enabled ####" -- cgit v1.2.3 From 922ec313f061cf75157d8e93fe16bc6397f6ea25 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 23:54:06 -0700 Subject: perf annotate-data: Fix missing constant copy I found that it missed copying the immediate constant when moving a register value. This could result in wrong type inference since the address of the per-cpu variable would always be 0.
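A hypothetical miniature of the bug pattern the one-line fix addresses (the names are illustrative, not the real perf structures):

```
#include <stdint.h>

/* Illustrative stand-in for perf's per-register type state. */
struct reg_state {
	int kind;		/* e.g. a TSR_KIND_CONST-like tag */
	uint64_t imm_value;	/* the constant a mov carries along */
	int ok;
};

/* Propagating state for "mov src, dst": every field that downstream
 * logic consumes must be copied, or later lookups see a zero value.
 */
static void track_mov(struct reg_state *dst, const struct reg_state *src)
{
	dst->kind = src->kind;
	dst->imm_value = src->imm_value;	/* the field the fix copies */
	dst->ok = 1;
}
```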
Fixes: eb9190afaed6afd5 ("perf annotate-data: Handle ADD instructions") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821065408.285548-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 7b7d462c6c6b..88b5bcf2116f 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -382,6 +382,7 @@ static void update_insn_state_x86(struct type_state *state, tsr->type = state->regs[src->reg1].type; tsr->kind = state->regs[src->reg1].kind; + tsr->imm_value = state->regs[src->reg1].imm_value; tsr->ok = true; pr_debug_dtp("mov [%x] reg%d -> reg%d", -- cgit v1.2.3 From 4a32a97268d304d5708f1212c4a4d0e1dd3246dd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 23:54:07 -0700 Subject: perf annotate-data: Prefer struct/union over base type Sometimes a compound type can have a single field and the size is the same as the base type. But it's still preferred as struct or union could carry more information than the base type. Also put a slight priority on the typedef for the same reason. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821065408.285548-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index e5589268cb42..bc65264084d8 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -404,6 +404,13 @@ static bool is_pointer_type(Dwarf_Die *type_die) return tag == DW_TAG_pointer_type || tag == DW_TAG_array_type; } +static bool is_compound_type(Dwarf_Die *type_die) +{ + int tag = dwarf_tag(type_die); + + return tag == DW_TAG_structure_type || tag == DW_TAG_union_type; +} + /* returns if Type B has better information than Type A */ static bool is_better_type(Dwarf_Die *type_a, Dwarf_Die *type_b) { @@ -433,7 +440,18 @@ static bool is_better_type(Dwarf_Die *type_a, Dwarf_Die *type_b) dwarf_aggregate_size(type_b, &size_b) < 0) return false; - return size_a < size_b; + if (size_a != size_b) + return size_a < size_b; + + /* struct or union is preferred */ + if (is_compound_type(type_a) != is_compound_type(type_b)) + return is_compound_type(type_b); + + /* typedef is preferred */ + if (dwarf_tag(type_b) == DW_TAG_typedef) + return true; + + return false; } /* The type info will be saved in @type_die */ -- cgit v1.2.3 From 4d6d6e0f61e2267103e9b013d2a82d04ff278127 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 23:54:08 -0700 Subject: perf annotate-data: Fix percpu pointer check In check_matching_type(), it checks the type state of the register in a wrong order. When it's the percpu pointer, it should check the type for the pointer, but it checks the CFA bit first and thought it has no type in the stack slot. This resulted in no type info. 
----------------------------------------------------------- find data type for 0x28(reg1) at hrtimer_reprogram+0x88 CU for kernel/time/hrtimer.c (die:0x18f219f) frame base: cfa=1 fbreg=7 ... add [72] percpu 0x24500 -> reg1 pointer type='struct hrtimer_cpu_base' size=0x240 (die:0x18f6d46) bb: [7a - 7e] bb: [80 - 86] (here) bb: [88 - 88] vvv chk [88] reg1 offset=0x28 ok=1 kind=4 cfa : no type information no type information Here, instruction at 0x72 found reg1 has a (percpu) pointer and got the correct type. But when it checks the final result, it wrongly thought it was stack variable because it checks the cfa bit first. After changing the order of state check: ----------------------------------------------------------- find data type for 0x28(reg1) at hrtimer_reprogram+0x88 CU for kernel/time/hrtimer.c (die:0x18f219f) frame base: cfa=1 fbreg=7 ... (here) vvvvvvvvvv chk [88] reg1 offset=0x28 ok=1 kind=4 percpu ptr : Good! found by insn track: 0x28(reg1) type-offset=0x28 final type: type='struct hrtimer_cpu_base' size=0x240 (die:0x18f6d46) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821065408.285548-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 3 + tools/perf/util/annotate-data.c | 122 ++++++++++++++-------------- 2 files changed, 66 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 88b5bcf2116f..15dfc2988e24 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -282,6 +282,7 @@ static void update_insn_state_x86(struct type_state *state, !strcmp(var_name, "this_cpu_off") && tsr->kind == TSR_KIND_CONST) { tsr->kind = TSR_KIND_PERCPU_BASE; + tsr->ok = true; imm_value = tsr->imm_value; } } @@ -533,9 +534,11 @@ retry: &var_name, &offset) && !strcmp(var_name, "__per_cpu_offset")) { tsr->kind = TSR_KIND_PERCPU_BASE; + tsr->ok = true; pr_debug_dtp("mov [%x] percpu base reg%d\n", insn_offset, dst->reg1); + return; } } diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index bc65264084d8..f5eefcb71c4f 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -965,7 +965,10 @@ static enum type_match_result check_matching_type(struct type_state *state, insn_offset, reg, dloc->op->offset, state->regs[reg].ok, state->regs[reg].kind); - if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { + if (!state->regs[reg].ok) + goto check_non_register; + + if (state->regs[reg].kind == TSR_KIND_TYPE) { Dwarf_Die sized_type; /* @@ -998,6 +1001,65 @@ static enum type_match_result check_matching_type(struct type_state *state, return PERF_TMR_OK; } + if (state->regs[reg].kind == TSR_KIND_POINTER) { + pr_debug_dtp("percpu ptr"); + + /* + * It's actaully pointer but the address was calculated using + * some arithmetic. So it points to the actual type already. 
+ */ + *type_die = state->regs[reg].type; + + dloc->type_offset = dloc->op->offset; + + /* Get the size of the actual type */ + if (dwarf_aggregate_size(type_die, &size) < 0 || + (unsigned)dloc->type_offset >= size) + return PERF_TMR_BAIL_OUT; + + return PERF_TMR_OK; + } + + if (state->regs[reg].kind == TSR_KIND_CANARY) { + pr_debug_dtp("stack canary"); + + /* + * This is a saved value of the stack canary which will be handled + * in the outer logic when it returns failure here. Pretend it's + * from the stack canary directly. + */ + setup_stack_canary(dloc); + + return PERF_TMR_BAIL_OUT; + } + + if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { + u64 var_addr = dloc->op->offset; + int var_offset; + + pr_debug_dtp("percpu var"); + + if (dloc->op->multi_regs) { + int reg2 = dloc->op->reg2; + + if (dloc->op->reg2 == reg) + reg2 = dloc->op->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + + if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, + &var_offset, type_die)) { + dloc->type_offset = var_offset; + return PERF_TMR_OK; + } + /* No need to retry per-cpu (global) variables */ + return PERF_TMR_BAIL_OUT; + } + +check_non_register: if (reg == dloc->fbreg) { struct type_state_stack *stack; @@ -1054,64 +1116,6 @@ static enum type_match_result check_matching_type(struct type_state *state, return PERF_TMR_OK; } - if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { - u64 var_addr = dloc->op->offset; - int var_offset; - - pr_debug_dtp("percpu var"); - - if (dloc->op->multi_regs) { - int reg2 = dloc->op->reg2; - - if (dloc->op->reg2 == reg) - reg2 = dloc->op->reg1; - - if (has_reg_type(state, reg2) && state->regs[reg2].ok && - state->regs[reg2].kind == TSR_KIND_CONST) - var_addr += state->regs[reg2].imm_value; - } - - if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, - &var_offset, type_die)) { - dloc->type_offset = var_offset; - return PERF_TMR_OK; - } - /* No need to retry per-cpu (global) variables */ - return PERF_TMR_BAIL_OUT; - } - - if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_POINTER) { - pr_debug_dtp("percpu ptr"); - - /* - * It's actaully pointer but the address was calculated using - * some arithmetic. So it points to the actual type already. - */ - *type_die = state->regs[reg].type; - - dloc->type_offset = dloc->op->offset; - - /* Get the size of the actual type */ - if (dwarf_aggregate_size(type_die, &size) < 0 || - (unsigned)dloc->type_offset >= size) - return PERF_TMR_BAIL_OUT; - - return PERF_TMR_OK; - } - - if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_CANARY) { - pr_debug_dtp("stack canary"); - - /* - * This is a saved value of the stack canary which will be handled - * in the outer logic when it returns failure here. Pretend it's - * from the stack canary directly. - */ - setup_stack_canary(dloc); - - return PERF_TMR_BAIL_OUT; - } - check_kernel: if (dso__kernel(map__dso(dloc->ms->map))) { u64 addr; -- cgit v1.2.3 From ce66d7c703d32851ceb50511e634712049f0c1c1 Mon Sep 17 00:00:00 2001 From: Yang Ruibin <11162571@vivo.com> Date: Wed, 21 Aug 2024 06:14:56 -0400 Subject: perf bpf: Remove redundant check that map is NULL The check that map is NULL is already done in the bpf_map__fd(map) and returns an errno, which does not run further checks. In addition, even if the check for map is run, the return is a pointer, which is not consistent with the err_number returned by bpf_map__fd(map). 
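A short sketch of why the dropped branch was dead code (a hedged reading of the libbpf contract, under the assumption that bpf_map__fd() rejects a NULL map itself):

```
int fd = bpf_map__fd(map);	/* returns a negative errno when map is NULL */

if (fd < 0)
	return fd;		/* a NULL map never gets past this point */

/* A later "if (!map) return PTR_ERR(map);" is therefore unreachable
 * with map == NULL, and PTR_ERR() would also mix a pointer-style
 * error into an int-returning function.
 */
```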
Signed-off-by: Yang Ruibin <11162571@vivo.com> Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: opensource.kernel@vivo.com Link: https://lore.kernel.org/r/20240821101500.4568-1-11162571@vivo.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_map.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_map.c b/tools/perf/util/bpf_map.c index c863ae0c5cb5..578f27d2d6b4 100644 --- a/tools/perf/util/bpf_map.c +++ b/tools/perf/util/bpf_map.c @@ -35,9 +35,6 @@ int bpf_map__fprintf(struct bpf_map *map, FILE *fp) if (fd < 0) return fd; - if (!map) - return PTR_ERR(map); - err = -ENOMEM; key = malloc(bpf_map__key_size(map)); if (key == NULL) -- cgit v1.2.3 From 7a5c2170244b4f55588353e68ebbafc249597ee6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 19 Aug 2024 16:36:02 -0700 Subject: perf annotate-data: Show offset and size in hex It'd be better to have them in hex to check cacheline alignment. Percent offset size field 100.00 0 0x1c0 struct cfs_rq { 0.00 0 0x10 struct load_weight load { 0.00 0 0x8 long unsigned int weight; 0.00 0x8 0x4 u32 inv_weight; }; 0.00 0x10 0x4 unsigned int nr_running; 14.56 0x14 0x4 unsigned int h_nr_running; 0.00 0x18 0x4 unsigned int idle_nr_running; 0.00 0x1c 0x4 unsigned int idle_h_nr_running; ... Committer notes: Justification from Namhyung when asked about why it would be "better": Cache line sizes are power of 2 so it'd be natural to use hex and check whether an offset is in the same boundary. Also 'perf annotate' shows instruction offsets in hex. > > Maybe this should be selectable? I can add an option and/or a config if you want. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819233603.54941-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 4 ++-- tools/perf/util/annotate-data.c | 2 +- tools/perf/util/sort.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index f563a3bb072c..cd562a8822b7 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -427,12 +427,12 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) /* print type info */ if (be->indent == 0 && !member->var_name) { - ui_browser__printf(uib, " %10d %10d %s%s", + ui_browser__printf(uib, " %#10x %#10x %s%s", member->offset, member->size, member->type_name, list_empty(&member->children) || be->folded? 
";" : " {"); } else { - ui_browser__printf(uib, " %10d %10d %*s%s\t%s%s", + ui_browser__printf(uib, " %#10x %#10x %*s%s\t%s%s", member->offset, member->size, be->indent * 4, "", member->type_name, member->var_name ?: "", diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index f5eefcb71c4f..1e8328dde720 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1704,7 +1704,7 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, nr_events++; } - printf(" %10d %10d %*s%s\t%s", + printf(" %#10x %#10x %*s%s\t%s", member->offset, member->size, indent, "", member->type_name, member->var_name ?: ""); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index c4046d5d1749..d315308f9170 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2312,7 +2312,7 @@ static int hist_entry__typeoff_snprintf(struct hist_entry *he, char *bf, he->mem_type_off, true); buf[4095] = '\0'; - return repsep_snprintf(bf, size, "%s %+d (%s)", he_type->self.type_name, + return repsep_snprintf(bf, size, "%s +%#x (%s)", he_type->self.type_name, he->mem_type_off, buf); } -- cgit v1.2.3 From fd45d52eae5c42fdb68802127fa98d3718c80043 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 19 Aug 2024 16:36:03 -0700 Subject: perf annotate-data: Add 'typecln' sort key Sometimes it's useful to organize member fields in cache-line boundary. The 'typecln' sort key is short for type-cacheline and to show samples in each cacheline. The cacheline size is fixed to 64 for now, but it can read the actual size once it saves the value from sysfs. For example, you maybe want to which cacheline in a target is hot or cold. The following shows members in the cfs_rq's first cache line. $ perf report -s type,typecln,typeoff -H ... - 2.67% struct cfs_rq + 1.23% struct cfs_rq: cache-line 2 + 0.57% struct cfs_rq: cache-line 4 + 0.46% struct cfs_rq: cache-line 6 - 0.41% struct cfs_rq: cache-line 0 0.39% struct cfs_rq +0x14 (h_nr_running) 0.02% struct cfs_rq +0x38 (tasks_timeline.rb_leftmost) ... Committer testing: # root@number:~# perf report -s type,typecln,typeoff -H --stdio # Total Lost Samples: 0 # # Samples: 5K of event 'cpu_atom/mem-loads,ldlat=5/P' # Event count (approx.): 312251 # # Overhead Data Type / Data Type Cacheline / Data Type Offset # .............. .................................................. 
# 0.07% struct sigaction 0.05% struct sigaction: cache-line 1 0.02% struct sigaction +0x58 (sa_mask) 0.02% struct sigaction +0x78 (sa_mask) 0.03% struct sigaction: cache-line 0 0.02% struct sigaction +0x38 (sa_mask) 0.01% struct sigaction +0x8 (sa_mask) Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819233603.54941-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/sort.h | 1 + 3 files changed, 54 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 30c13fc8cbe4..deb1087c5948 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -86,6 +86,7 @@ enum hist_column { HISTC_TYPE, HISTC_TYPE_OFFSET, HISTC_SYMBOL_OFFSET, + HISTC_TYPE_CACHELINE, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index d315308f9170..013020f33ece 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2326,6 +2326,57 @@ struct sort_entry sort_type_offset = { .se_width_idx = HISTC_TYPE_OFFSET, }; +/* --sort typecln */ + +/* TODO: use actual value in the system */ +#define TYPE_CACHELINE_SIZE 64 + +static int64_t +sort__typecln_sort(struct hist_entry *left, struct hist_entry *right) +{ + struct annotated_data_type *left_type = left->mem_type; + struct annotated_data_type *right_type = right->mem_type; + int64_t left_cln, right_cln; + int64_t ret; + + if (!left_type) { + sort__type_init(left); + left_type = left->mem_type; + } + + if (!right_type) { + sort__type_init(right); + right_type = right->mem_type; + } + + ret = strcmp(left_type->self.type_name, right_type->self.type_name); + if (ret) + return ret; + + left_cln = left->mem_type_off / TYPE_CACHELINE_SIZE; + right_cln = right->mem_type_off / TYPE_CACHELINE_SIZE; + return left_cln - right_cln; +} + +static int hist_entry__typecln_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width __maybe_unused) +{ + struct annotated_data_type *he_type = he->mem_type; + + return repsep_snprintf(bf, size, "%s: cache-line %d", he_type->self.type_name, + he->mem_type_off / TYPE_CACHELINE_SIZE); +} + +struct sort_entry sort_type_cacheline = { + .se_header = "Data Type Cacheline", + .se_cmp = sort__type_cmp, + .se_collapse = sort__typecln_sort, + .se_sort = sort__typecln_sort, + .se_init = sort__type_init, + .se_snprintf = hist_entry__typecln_snprintf, + .se_width_idx = HISTC_TYPE_CACHELINE, +}; + struct sort_dimension { const char *name; @@ -2384,6 +2435,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_ANNOTATE_DATA_TYPE, "type", sort_type), DIM(SORT_ANNOTATE_DATA_TYPE_OFFSET, "typeoff", sort_type_offset), DIM(SORT_SYM_OFFSET, "symoff", sort_sym_offset), + DIM(SORT_ANNOTATE_DATA_TYPE_CACHELINE, "typecln", sort_type_cacheline), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 6357bc32c5ca..9ff68c6786e7 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -71,6 +71,7 @@ enum sort_type { SORT_ANNOTATE_DATA_TYPE, SORT_ANNOTATE_DATA_TYPE_OFFSET, SORT_SYM_OFFSET, + SORT_ANNOTATE_DATA_TYPE_CACHELINE, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, -- cgit v1.2.3 From b0cd726f9a8279b33faa5b4b0011df14f331fb33 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 8 
Aug 2024 16:22:30 -0700 Subject: selftests/bpf: test passing iterator to a kfunc Define BPF iterator "getter" kfunc, which accepts iterator pointer as one of the arguments. Make sure that argument passed doesn't have to be the very first argument (unlike new-next-destroy combo). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240808232230.2848712-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 16 +++++-- .../selftests/bpf/progs/iters_testmod_seq.c | 50 ++++++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 3687a40b61c6..c04b7dec2ab9 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -141,13 +141,12 @@ bpf_testmod_test_mod_kfunc(int i) __bpf_kfunc int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) { - if (cnt < 0) { - it->cnt = 0; + it->cnt = cnt; + + if (cnt < 0) return -EINVAL; - } it->value = value; - it->cnt = cnt; return 0; } @@ -162,6 +161,14 @@ __bpf_kfunc s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq* it) return &it->value; } +__bpf_kfunc s64 bpf_iter_testmod_seq_value(int val, struct bpf_iter_testmod_seq* it__iter) +{ + if (it__iter->cnt < 0) + return 0; + + return val + it__iter->value; +} + __bpf_kfunc void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) { it->cnt = 0; @@ -531,6 +538,7 @@ BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) BTF_ID_FLAGS(func, bpf_kfunc_common_test) BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 3873fb6c292a..4a176e6aede8 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -12,6 +12,7 @@ struct bpf_iter_testmod_seq { extern int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) __ksym; extern s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq *it) __ksym; +extern s64 bpf_iter_testmod_seq_value(int blah, struct bpf_iter_testmod_seq *it) __ksym; extern void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) __ksym; const volatile __s64 exp_empty = 0 + 1; @@ -76,4 +77,53 @@ int testmod_seq_truncated(const void *ctx) return 0; } +SEC("?raw_tp") +__failure +__msg("expected an initialized iter_testmod_seq as arg #2") +int testmod_seq_getter_before_bad(const void *ctx) +{ + struct bpf_iter_testmod_seq it; + + return bpf_iter_testmod_seq_value(0, &it); +} + +SEC("?raw_tp") +__failure +__msg("expected an initialized iter_testmod_seq as arg #2") +int testmod_seq_getter_after_bad(const void *ctx) +{ + struct bpf_iter_testmod_seq it; + s64 sum = 0, *v; + + bpf_iter_testmod_seq_new(&it, 100, 100); + + while ((v = bpf_iter_testmod_seq_next(&it))) { + sum += *v; + } + + bpf_iter_testmod_seq_destroy(&it); + + return sum + bpf_iter_testmod_seq_value(0, &it); +} + +SEC("?socket") +__success __retval(1000000) +int testmod_seq_getter_good(const void 
*ctx) +{ + struct bpf_iter_testmod_seq it; + s64 sum = 0, *v; + + bpf_iter_testmod_seq_new(&it, 100, 100); + + while ((v = bpf_iter_testmod_seq_next(&it))) { + sum += *v; + } + + sum *= bpf_iter_testmod_seq_value(0, &it); + + bpf_iter_testmod_seq_destroy(&it); + + return sum; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 7d41dad105b6667745f0984e386caf3289cef6db Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:49 -0700 Subject: selftests/bpf: less spam in the log for message matching When running test_loader-based tests in verbose mode, each matched message leaves a trace on stderr, e.g.: ./test_progs -vvv -t ... validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec This is not very helpful when debugging such tests and clobbers the log a lot. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 12b0c41e8d64..1b1290e090e7 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -531,7 +531,8 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, } } - if (!ASSERT_OK_PTR(match, "expect_msg")) { + if (!match) { + PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { -- cgit v1.2.3 From d0a29cdb6ef95d8a175e09ab2d1334271f047e60 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:50 -0700 Subject: selftests/bpf: correctly move 'log' upon successful match Suppose log="foo bar buz" and msg->substr="bar". In such a case, the current match processing logic would update 'log' as follows: log += strlen(msg->substr); -> log += 3 -> log=" bar buz". However, the intent behind the 'log' update is to make it point after the successful match, e.g. to make log=" buz" in the example above. Fixes: 4ef5d6af4935 ("selftests/bpf: no need to track next_match_pos in struct test_loader") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 1b1290e090e7..f5f5d16ac550 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -522,7 +522,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, if (msg->substr) { match = strstr(log, msg->substr); if (match) - log += strlen(msg->substr); + log = match + strlen(msg->substr); } else { err = regexec(&msg->regex, log, 1, reg_match, 0); if (err == 0) { -- cgit v1.2.3 From f00bb757ed630affc951691ddaff206039cbb7ee Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:51 -0700 Subject: selftests/bpf: fix to avoid __msg tag de-duplication by clang __msg, __regex and __xlated tags are based on __attribute__((btf_decl_tag("..."))) annotations. Clang de-duplicates such annotations, e.g.
the following two sequences of tags are identical in final BTF: /* seq A */ /* seq B */ __tag("foo") __tag("foo") __tag("bar") __tag("bar") __tag("foo") Fix this by adding a unique suffix for each tag using __COUNTER__ pre-processor macro. E.g. here is a new definition for __msg: #define __msg(msg) \ __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) Using this definition the "seq A" from example above is translated to BTF as follows: [..] DECL_TAG 'comment:test_expect_msg=0=foo' type_id=X component_idx=-1 [..] DECL_TAG 'comment:test_expect_msg=1=bar' type_id=X component_idx=-1 [..] DECL_TAG 'comment:test_expect_msg=2=foo' type_id=X component_idx=-1 Surprisingly, this bug affects a single existing test: verifier_spill_fill/old_stack_misc_vs_cur_ctx_ptr, where sequence of identical messages was expected in the log. Fixes: 537c3f66eac1 ("selftests/bpf: add generic BPF program tester-loader") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 15 ++++--- .../selftests/bpf/progs/verifier_spill_fill.c | 8 ++-- tools/testing/selftests/bpf/test_loader.c | 47 ++++++++++++++++------ 3 files changed, 48 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a225cd87897c..4f1029743734 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -2,6 +2,9 @@ #ifndef __BPF_MISC_H__ #define __BPF_MISC_H__ +#define XSTR(s) STR(s) +#define STR(s) #s + /* This set of attributes controls behavior of the * test_loader.c:test_loader__run_subtests(). * @@ -68,15 +71,15 @@ * Several __arch_* annotations could be specified at once. * When test case is not run on current arch it is marked as skipped. 
*/ -#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) -#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) -#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" msg))) +#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) +#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" XSTR(__COUNTER__) "=" regex))) +#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) -#define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" msg))) -#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" regex))) -#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" msg))) +#define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" XSTR(__COUNTER__) "=" regex))) +#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 9d288ec7a168..671d9f415dbf 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -1213,10 +1213,10 @@ __success __log_level(2) * - once for path entry - label 2; * - once for path entry - label 1 - label 2. */ -__msg("r1 = *(u64 *)(r10 -8)") -__msg("exit") -__msg("r1 = *(u64 *)(r10 -8)") -__msg("exit") +__msg("8: (79) r1 = *(u64 *)(r10 -8)") +__msg("9: (95) exit") +__msg("from 2 to 7") +__msg("8: safe") __msg("processed 11 insns") __flag(BPF_F_TEST_STATE_FREQ) __naked void old_stack_misc_vs_cur_ctx_ptr(void) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index f5f5d16ac550..c604a82e03c4 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -215,6 +215,35 @@ static void update_flags(int *flags, int flag, bool clear) *flags |= flag; } +/* Matches a string of form '<pfx>[^=]=.*' and returns its suffix. + * Used to parse btf_decl_tag values. + * Such values require unique prefix because compiler does not add + * same __attribute__((btf_decl_tag(...))) twice. + * Test suite uses two-component tags for such cases: + * + * <pfx> __COUNTER__ '=' <msg> + * + * For example, two consecutive __msg tags '__msg("foo") __msg("foo")' + * would be encoded as: + * + * [18] DECL_TAG 'comment:test_expect_msg=0=foo' type_id=15 component_idx=-1 + * [19] DECL_TAG 'comment:test_expect_msg=1=foo' type_id=15 component_idx=-1 + * + * And the purpose of this function is to extract 'foo' from the above.
+ */ +static const char *skip_dynamic_pfx(const char *s, const char *pfx) +{ + const char *msg; + + if (strncmp(s, pfx, strlen(pfx)) != 0) + return NULL; + msg = s + strlen(pfx); + msg = strchr(msg, '='); + if (!msg) + return NULL; + return msg + 1; +} + enum arch { ARCH_X86_64 = 0x1, ARCH_ARM64 = 0x2, @@ -290,38 +319,32 @@ static int parse_test_spec(struct test_loader *tester, } else if (strcmp(s, TEST_TAG_AUXILIARY_UNPRIV) == 0) { spec->auxiliary = true; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { err = push_msg(msg, NULL, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX))) { err = push_msg(NULL, msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV))) { err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { err = push_msg(msg, NULL, &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= UNPRIV; -- cgit v1.2.3 From f8d161756d422598e10a112171a73cf621e67fae Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:52 -0700 Subject: selftests/bpf: replace __regex macro with "{{...}}" patterns Upcoming changes require a notation to specify regular expression matches for regular verifier log messages, disassembly of BPF instructions, disassembly of jited instructions. Neither basic nor extended POSIX regular expressions w/o additional escaping are good for this role because of wide use of special characters in disassembly, for example: movq -0x10(%rbp), %rax ;; () are special characters cmpq $0x21, %rax ;; $ is a special character *(u64 *)(r10 -16) = r1 ;; * and () are special characters This commit borrows syntax from LLVM's FileCheck utility. It replaces the __regex macro with the ability to embed regular expressions in __msg patterns using "{{" "}}" pairs for escaping. Syntax for __msg patterns: pattern := (<verbatim text> | regex)* regex := "{{" <posix extended regular expression> "}}" For example, pattern "foo{{[0-9]+}}" matches strings like "foo0", "foo007", etc.
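To make the escaping concrete, consider a hypothetical pattern (not one taken from this patch set) matching a verifier log line that is full of regex-special characters. Given the __msg pattern

    *(u64 *)(r10 -{{[0-9]+}})

the loader (see compile_regex() in the diff below) escapes the special characters in the verbatim part and passes the bracketed part through unchanged, producing the extended POSIX regex

    \*\(u64 \*\)\(r10 -[0-9]+\)

which matches "*(u64 *)(r10 -8)", "*(u64 *)(r10 -16)", and so on.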
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 9 +- tools/testing/selftests/bpf/progs/dynptr_fail.c | 6 +- tools/testing/selftests/bpf/progs/rbtree_fail.c | 2 +- .../selftests/bpf/progs/refcounted_kptr_fail.c | 4 +- tools/testing/selftests/bpf/test_loader.c | 164 +++++++++++++-------- 5 files changed, 115 insertions(+), 70 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 4f1029743734..cc3ef20a6490 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -25,12 +25,15 @@ * * __msg Message expected to be found in the verifier log. * Multiple __msg attributes could be specified. + * To match a regular expression use "{{" "}}" brackets, + * e.g. "foo{{[0-9]+}}" matches strings like "foo007". + * Extended POSIX regular expression syntax is allowed + * inside the brackets. * __msg_unpriv Same as __msg but for unprivileged mode. * - * __regex Same as __msg, but using a regular expression. - * __regex_unpriv Same as __msg_unpriv but using a regular expression. * __xlated Expect a line in a disassembly log after verifier applies rewrites. * Multiple __xlated attributes could be specified. + * Regular expressions could be specified same way as in __msg. * __xlated_unpriv Same as __xlated but for unprivileged mode. * * __success Expect program load success in privileged mode. @@ -72,13 +75,11 @@ * When test case is not run on current arch it is marked as skipped. */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) -#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" XSTR(__COUNTER__) "=" regex))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) -#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" XSTR(__COUNTER__) "=" regex))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index e35bc1eac52a..68b8c6eca508 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -964,7 +964,7 @@ int dynptr_invalidate_slice_reinit(void *ctx) * mem_or_null pointers. 
*/ SEC("?raw_tp") -__failure __regex("R[0-9]+ type=scalar expected=percpu_ptr_") +__failure __msg("R{{[0-9]+}} type=scalar expected=percpu_ptr_") int dynptr_invalidate_slice_or_null(void *ctx) { struct bpf_dynptr ptr; @@ -982,7 +982,7 @@ int dynptr_invalidate_slice_or_null(void *ctx) /* Destruction of dynptr should also any slices obtained from it */ SEC("?raw_tp") -__failure __regex("R[0-9]+ invalid mem access 'scalar'") +__failure __msg("R{{[0-9]+}} invalid mem access 'scalar'") int dynptr_invalidate_slice_failure(void *ctx) { struct bpf_dynptr ptr1; @@ -1069,7 +1069,7 @@ int dynptr_read_into_slot(void *ctx) /* bpf_dynptr_slice()s are read-only and cannot be written to */ SEC("?tc") -__failure __regex("R[0-9]+ cannot write into rdonly_mem") +__failure __msg("R{{[0-9]+}} cannot write into rdonly_mem") int skb_invalid_slice_write(struct __sk_buff *skb) { struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index b722a1e1ddef..dbd5eee8e25e 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -105,7 +105,7 @@ long rbtree_api_remove_unadded_node(void *ctx) } SEC("?tc") -__failure __regex("Unreleased reference id=3 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=3 alloc_insn={{[0-9]+}}") long rbtree_api_remove_no_drop(void *ctx) { struct bpf_rb_node *res; diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c index f8d4b7cfcd68..836c8ab7b908 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c @@ -32,7 +32,7 @@ static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b) } SEC("?tc") -__failure __regex("Unreleased reference id=4 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=4 alloc_insn={{[0-9]+}}") long rbtree_refcounted_node_ref_escapes(void *ctx) { struct node_acquire *n, *m; @@ -73,7 +73,7 @@ long refcount_acquire_maybe_null(void *ctx) } SEC("?tc") -__failure __regex("Unreleased reference id=3 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=3 alloc_insn={{[0-9]+}}") long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx) { struct node_acquire *n, *m; diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index c604a82e03c4..b0d7158e00c1 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -19,12 +19,10 @@ #define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" -#define TEST_TAG_EXPECT_REGEX_PFX "comment:test_expect_regex=" #define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" -#define TEST_TAG_EXPECT_REGEX_PFX_UNPRIV "comment:test_expect_regex_unpriv=" #define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" @@ -55,8 +53,9 @@ enum mode { struct expect_msg { const char *substr; /* substring match */ - const char *regex_str; /* regex-based match */ 
regex_t regex; + bool is_regex; + bool on_next_line; }; struct expected_msgs { @@ -111,7 +110,7 @@ static void free_msgs(struct expected_msgs *msgs) int i; for (i = 0; i < msgs->cnt; i++) - if (msgs->patterns[i].regex_str) + if (msgs->patterns[i].is_regex) regfree(&msgs->patterns[i].regex); free(msgs->patterns); msgs->patterns = NULL; @@ -132,12 +131,71 @@ static void free_test_spec(struct test_spec *spec) spec->unpriv.name = NULL; } -static int push_msg(const char *substr, const char *regex_str, struct expected_msgs *msgs) +/* Compiles regular expression matching pattern. + * Pattern has a special syntax: + * + * pattern := (<verbatim text> | regex)* + * regex := "{{" <posix extended regular expression> "}}" + * + * In other words, pattern is a verbatim text with inclusion + * of regular expressions enclosed in "{{" "}}" pairs. + * For example, pattern "foo{{[0-9]+}}" matches strings like + * "foo0", "foo007", etc. + */ +static int compile_regex(const char *pattern, regex_t *regex) +{ + char err_buf[256], buf[256] = {}, *ptr, *buf_end; + const char *original_pattern = pattern; + bool in_regex = false; + int err; + + buf_end = buf + sizeof(buf); + ptr = buf; + while (*pattern && ptr < buf_end - 2) { + if (!in_regex && str_has_pfx(pattern, "{{")) { + in_regex = true; + pattern += 2; + continue; + } + if (in_regex && str_has_pfx(pattern, "}}")) { + in_regex = false; + pattern += 2; + continue; + } + if (in_regex) { + *ptr++ = *pattern++; + continue; + } + /* list of characters that need escaping for extended posix regex */ + if (strchr(".[]\\()*+?{}|^$", *pattern)) { + *ptr++ = '\\'; + *ptr++ = *pattern++; + continue; + } + *ptr++ = *pattern++; + } + if (*pattern) { + PRINT_FAIL("Regexp too long: '%s'\n", original_pattern); + return -EINVAL; + } + if (in_regex) { + PRINT_FAIL("Regexp has open '{{' but no closing '}}': '%s'\n", original_pattern); + return -EINVAL; + } + err = regcomp(regex, buf, REG_EXTENDED | REG_NEWLINE); + if (err != 0) { + regerror(err, regex, err_buf, sizeof(err_buf)); + PRINT_FAIL("Regexp compilation error in '%s': '%s'\n", buf, err_buf); + return -EINVAL; + } + return 0; +} + +static int __push_msg(const char *pattern, bool on_next_line, struct expected_msgs *msgs) { - void *tmp; - int regcomp_res; - char error_msg[100]; struct expect_msg *msg; + void *tmp; + int err; + tmp = realloc(msgs->patterns, (1 + msgs->cnt) * sizeof(struct expect_msg)); @@ -147,26 +205,38 @@ static int push_msg(const char *substr, const char *regex_str, struct expected_m } msgs->patterns = tmp; msg = &msgs->patterns[msgs->cnt]; - - if (substr) { - msg->substr = substr; - msg->regex_str = NULL; - } else { - msg->regex_str = regex_str; - msg->substr = NULL; - regcomp_res = regcomp(&msg->regex, regex_str, REG_EXTENDED|REG_NEWLINE); - if (regcomp_res != 0) { - regerror(regcomp_res, &msg->regex, error_msg, sizeof(error_msg)); - PRINT_FAIL("Regexp compilation error in '%s': '%s'\n", - regex_str, error_msg); - return -EINVAL; - } + msg->on_next_line = on_next_line; + msg->substr = pattern; + msg->is_regex = false; + if (strstr(pattern, "{{")) { + err = compile_regex(pattern, &msg->regex); + if (err) + return err; + msg->is_regex = true; } - msgs->cnt += 1; return 0; } + +static int clone_msgs(struct expected_msgs *from, struct expected_msgs *to) +{ + struct expect_msg *msg; + int i, err; + + for (i = 0; i < from->cnt; i++) { + msg = &from->patterns[i]; + err = __push_msg(msg->substr, msg->on_next_line, to); + if (err) + return err; + } + return 0; +} + +static int push_msg(const char *substr, struct expected_msgs *msgs) +{ + return __push_msg(substr,
false, msgs); +} + static int parse_int(const char *str, int *val, const char *name) { char *end; @@ -320,32 +390,22 @@ static int parse_test_spec(struct test_loader *tester, spec->auxiliary = true; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { - err = push_msg(msg, NULL, &spec->priv.expect_msgs); + err = push_msg(msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { - err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); - if (err) - goto cleanup; - spec->mode_mask |= UNPRIV; - } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX))) { - err = push_msg(NULL, msg, &spec->priv.expect_msgs); - if (err) - goto cleanup; - spec->mode_mask |= PRIV; - } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV))) { - err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); + err = push_msg(msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { - err = push_msg(msg, NULL, &spec->priv.expect_xlated); + err = push_msg(msg, &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { - err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); + err = push_msg(msg, &spec->unpriv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -457,26 +517,10 @@ static int parse_test_spec(struct test_loader *tester, spec->unpriv.execute = spec->priv.execute; } - if (spec->unpriv.expect_msgs.cnt == 0) { - for (i = 0; i < spec->priv.expect_msgs.cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_msgs.patterns[i]; - - err = push_msg(msg->substr, msg->regex_str, - &spec->unpriv.expect_msgs); - if (err) - goto cleanup; - } - } - if (spec->unpriv.expect_xlated.cnt == 0) { - for (i = 0; i < spec->priv.expect_xlated.cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_xlated.patterns[i]; - - err = push_msg(msg->substr, msg->regex_str, - &spec->unpriv.expect_xlated); - if (err) - goto cleanup; - } - } + if (spec->unpriv.expect_msgs.cnt == 0) + clone_msgs(&spec->priv.expect_msgs, &spec->unpriv.expect_msgs); + if (spec->unpriv.expect_xlated.cnt == 0) + clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); } spec->valid = true; @@ -542,7 +586,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, struct expect_msg *msg = &msgs->patterns[i]; const char *match = NULL; - if (msg->substr) { + if (!msg->is_regex) { match = strstr(log, msg->substr); if (match) log = match + strlen(msg->substr); @@ -562,8 +606,8 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", j < i ? "MATCHED " : "EXPECTED", - msg->substr ? "SUBSTR" : " REGEX", - msg->substr ?: msg->regex_str); + msg->is_regex ? " REGEX" : "SUBSTR", + msg->substr); } return; } -- cgit v1.2.3 From b991fc52070042468f31b70fb8ccab96ba351f8a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:53 -0700 Subject: selftests/bpf: utility function to get program disassembly after jit This commit adds a utility function to get disassembled text for jited representation of a BPF program designated by file descriptor. 
Function prototype looks as follows: int get_jited_program_text(int fd, char *text, size_t text_sz) Where 'fd' is a file descriptor for the program, 'text' and 'text_sz' refer to a destination buffer for disassembled text. Output format looks as follows: 18: 77 06 ja L0 1a: 50 pushq %rax 1b: 48 89 e0 movq %rsp, %rax 1e: eb 01 jmp L1 20: 50 L0: pushq %rax 21: 50 L1: pushq %rax ^ ^^^^^^^^ ^ ^^^^^^^^^^^^^^^^^^ | binary insn | textual insn | representation | representation | | instruction offset inferred local label name The code and makefile changes are inspired by jit_disasm.c from bpftool. Use llvm libraries to disassemble BPF program instead of libbfd to avoid issues with disassembly output stability pointed out in [1]. Selftests makefile uses Makefile.feature to detect if LLVM libraries are available. If that is not the case selftests build proceeds but the function returns -EOPNOTSUPP at runtime. [1] commit eb9d1acf634b ("bpftool: Add LLVM as default library for disassembling JIT-ed programs") Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 48 ++++- tools/testing/selftests/bpf/jit_disasm_helpers.c | 234 +++++++++++++++++++++++ tools/testing/selftests/bpf/jit_disasm_helpers.h | 10 + 4 files changed, 291 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/jit_disasm_helpers.c create mode 100644 tools/testing/selftests/bpf/jit_disasm_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 99ffea1fa5c6..e6533b3400de 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -8,6 +8,7 @@ test_lru_map test_lpm_map test_tag FEATURE-DUMP.libbpf +FEATURE-DUMP.selftests fixdep /test_progs /test_progs-no_alu32 diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7bca9a2b8bc0..6d6d0c29db47 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,6 +33,13 @@ OPT_FLAGS ?= $(if $(RELEASE),-O2,-O0) LIBELF_CFLAGS := $(shell $(PKG_CONFIG) libelf --cflags 2>/dev/null) LIBELF_LIBS := $(shell $(PKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf) +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + CFLAGS += -g $(OPT_FLAGS) -rdynamic \ -Wall -Werror -fno-omit-frame-pointer \ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ @@ -60,6 +67,9 @@ progs/timer_crash.c-CFLAGS := -fno-strict-aliasing progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing +# Some utility functions use LLVM libraries +jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) + ifneq ($(LLVM),) # Silence some warnings when compiled with clang CFLAGS += -Wno-unused-command-line-argument @@ -168,6 +178,31 @@ endef include ../lib.mk +NON_CHECK_FEAT_TARGETS := clean docs-clean +CHECK_FEAT := $(filter-out $(NON_CHECK_FEAT_TARGETS),$(or $(MAKECMDGOALS), "none")) +ifneq ($(CHECK_FEAT),) +FEATURE_USER := .selftests +FEATURE_TESTS := llvm +FEATURE_DISPLAY := $(FEATURE_TESTS) + +# Makefile.feature expects OUTPUT to end with a slash +$(let OUTPUT,$(OUTPUT)/,\ + $(eval include ../../../build/Makefile.feature)) +endif + +ifeq 
($(feature-llvm),1) + LLVM_CFLAGS += -DHAVE_LLVM_SUPPORT + LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets + # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict + LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LLVM_LDLIBS += -lstdc++ + endif + LLVM_LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags) +endif + SCRATCH_DIR := $(OUTPUT)/tools BUILD_DIR := $(SCRATCH_DIR)/build INCLUDE_DIR := $(SCRATCH_DIR)/include @@ -612,6 +647,10 @@ ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) endif +# add per extra obj CFGLAGS definitions +$(foreach N,$(patsubst $(TRUNNER_OUTPUT)/%.o,%,$(TRUNNER_EXTRA_OBJS)), \ + $(eval $(TRUNNER_OUTPUT)/$(N).o: CFLAGS += $($(N).c-CFLAGS))) + $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ $(TRUNNER_EXTRA_HDRS) \ @@ -628,6 +667,9 @@ ifneq ($2:$(OUTPUT),:$(shell pwd)) $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif +$(OUTPUT)/$(TRUNNER_BINARY): LDLIBS += $$(LLVM_LDLIBS) +$(OUTPUT)/$(TRUNNER_BINARY): LDFLAGS += $$(LLVM_LDFLAGS) + # some X.test.o files have runtime dependencies on Y.bpf.o files $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) @@ -637,7 +679,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_BPFTOOL) \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) - $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ + $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LDFLAGS) -o $$@ $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool @@ -656,6 +698,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ cap_helpers.c \ unpriv_helpers.c \ netlink_helpers.c \ + jit_disasm_helpers.c \ test_loader.c \ xsk.c \ disasm.c \ @@ -798,7 +841,8 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc bpf_testmod.ko \ bpf_test_no_cfi.ko \ - liburandom_read.so) + liburandom_read.so) \ + $(OUTPUT)/FEATURE-DUMP.selftests .PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c new file mode 100644 index 000000000000..1b0f1fd267c0 --- /dev/null +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include +#include +#include + +#ifdef HAVE_LLVM_SUPPORT + +#include +#include +#include +#include + +/* The intent is to use get_jited_program_text() for small test + * programs written in BPF assembly, thus assume that 32 local labels + * would be sufficient. 
+ */ +#define MAX_LOCAL_LABELS 32 + +static bool llvm_initialized; + +struct local_labels { + bool print_phase; + __u32 prog_len; + __u32 cnt; + __u32 pcs[MAX_LOCAL_LABELS]; + char names[MAX_LOCAL_LABELS][4]; +}; + +static const char *lookup_symbol(void *data, uint64_t ref_value, uint64_t *ref_type, + uint64_t ref_pc, const char **ref_name) +{ + struct local_labels *labels = data; + uint64_t type = *ref_type; + int i; + + *ref_type = LLVMDisassembler_ReferenceType_InOut_None; + *ref_name = NULL; + if (type != LLVMDisassembler_ReferenceType_In_Branch) + return NULL; + /* Depending on labels->print_phase either discover local labels or + * return a name assigned with local jump target: + * - if print_phase is true and ref_value is in labels->pcs, + * return corresponding labels->name. + * - if print_phase is false, save program-local jump targets + * in labels->pcs; + */ + if (labels->print_phase) { + for (i = 0; i < labels->cnt; ++i) + if (labels->pcs[i] == ref_value) + return labels->names[i]; + } else { + if (labels->cnt < MAX_LOCAL_LABELS && ref_value < labels->prog_len) + labels->pcs[labels->cnt++] = ref_value; + } + return NULL; +} + +static int disasm_insn(LLVMDisasmContextRef ctx, uint8_t *image, __u32 len, __u32 pc, + char *buf, __u32 buf_sz) +{ + int i, cnt; + + cnt = LLVMDisasmInstruction(ctx, image + pc, len - pc, pc, + buf, buf_sz); + if (cnt > 0) + return cnt; + PRINT_FAIL("Can't disasm instruction at offset %d:", pc); + for (i = 0; i < 16 && pc + i < len; ++i) + printf(" %02x", image[pc + i]); + printf("\n"); + return -EINVAL; +} + +static int cmp_u32(const void *_a, const void *_b) +{ + __u32 a = *(__u32 *)_a; + __u32 b = *(__u32 *)_b; + + if (a < b) + return -1; + if (a > b) + return 1; + return 0; +} + +static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) +{ + char *label, *colon, *triple = NULL; + LLVMDisasmContextRef ctx = NULL; + struct local_labels labels = {}; + __u32 *label_pc, pc; + int i, cnt, err = 0; + char buf[64]; + + triple = LLVMGetDefaultTargetTriple(); + ctx = LLVMCreateDisasm(triple, &labels, 0, NULL, lookup_symbol); + if (!ASSERT_OK_PTR(ctx, "LLVMCreateDisasm")) { + err = -EINVAL; + goto out; + } + + cnt = LLVMSetDisasmOptions(ctx, LLVMDisassembler_Option_PrintImmHex); + if (!ASSERT_EQ(cnt, 1, "LLVMSetDisasmOptions")) { + err = -EINVAL; + goto out; + } + + /* discover labels */ + labels.prog_len = len; + pc = 0; + while (pc < len) { + cnt = disasm_insn(ctx, image, len, pc, buf, 1); + if (cnt < 0) { + err = cnt; + goto out; + } + pc += cnt; + } + qsort(labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); + for (i = 0; i < labels.cnt; ++i) + /* use (i % 100) to avoid format truncation warning */ + snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % 100); + + /* now print with labels */ + labels.print_phase = true; + pc = 0; + while (pc < len) { + cnt = disasm_insn(ctx, image, len, pc, buf, sizeof(buf)); + if (cnt < 0) { + err = cnt; + goto out; + } + label_pc = bsearch(&pc, labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); + label = ""; + colon = ""; + if (label_pc) { + label = labels.names[label_pc - labels.pcs]; + colon = ":"; + } + fprintf(text_out, "%x:\t", pc); + for (i = 0; i < cnt; ++i) + fprintf(text_out, "%02x ", image[pc + i]); + for (i = cnt * 3; i < 12 * 3; ++i) + fputc(' ', text_out); + fprintf(text_out, "%s%s%s\n", label, colon, buf); + pc += cnt; + } + +out: + if (triple) + LLVMDisposeMessage(triple); + if (ctx) + LLVMDisasmDispose(ctx); + return err; +} + +int get_jited_program_text(int fd, char *text, size_t 
text_sz) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 jited_funcs, len, pc; + __u32 *func_lens = NULL; + FILE *text_out = NULL; + uint8_t *image = NULL; + int i, err = 0; + + if (!llvm_initialized) { + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargetMCs(); + LLVMInitializeAllDisassemblers(); + llvm_initialized = 1; + } + + text_out = fmemopen(text, text_sz, "w"); + if (!ASSERT_OK_PTR(text_out, "open_memstream")) { + err = -errno; + goto out; + } + + /* first call is to find out jited program len */ + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd #1")) + goto out; + + len = info.jited_prog_len; + image = malloc(len); + if (!ASSERT_OK_PTR(image, "malloc(info.jited_prog_len)")) { + err = -ENOMEM; + goto out; + } + + jited_funcs = info.nr_jited_func_lens; + func_lens = malloc(jited_funcs * sizeof(__u32)); + if (!ASSERT_OK_PTR(func_lens, "malloc(info.nr_jited_func_lens)")) { + err = -ENOMEM; + goto out; + } + + memset(&info, 0, sizeof(info)); + info.jited_prog_insns = (__u64)image; + info.jited_prog_len = len; + info.jited_func_lens = (__u64)func_lens; + info.nr_jited_func_lens = jited_funcs; + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd #2")) + goto out; + + for (pc = 0, i = 0; i < jited_funcs; ++i) { + fprintf(text_out, "func #%d:\n", i); + disasm_one_func(text_out, image + pc, func_lens[i]); + fprintf(text_out, "\n"); + pc += func_lens[i]; + } + +out: + if (text_out) + fclose(text_out); + if (image) + free(image); + if (func_lens) + free(func_lens); + return err; +} + +#else /* HAVE_LLVM_SUPPORT */ + +int get_jited_program_text(int fd, char *text, size_t text_sz) +{ + if (env.verbosity >= VERBOSE_VERY) + printf("compiled w/o llvm development libraries, can't dis-assembly binary code"); + return -EOPNOTSUPP; +} + +#endif /* HAVE_LLVM_SUPPORT */ diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.h b/tools/testing/selftests/bpf/jit_disasm_helpers.h new file mode 100644 index 000000000000..e6924fd65ecf --- /dev/null +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __JIT_DISASM_HELPERS_H +#define __JIT_DISASM_HELPERS_H + +#include + +int get_jited_program_text(int fd, char *text, size_t text_sz); + +#endif /* __JIT_DISASM_HELPERS_H */ -- cgit v1.2.3 From 7d743e4c759c6bff0131d638158c3472358eda2b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:54 -0700 Subject: selftests/bpf: __jited test tag to check disassembly after jit Allow to verify jit behaviour by writing tests as below: SEC("tp") __arch_x86_64 __jited(" endbr64") __jited(" nopl (%rax,%rax)") __jited(" xorq %rax, %rax") ... __naked void some_test(void) { asm volatile (... ::: __clobber_all); } Allow regular expressions in patterns, same way as in __msg. By default assume that each __jited pattern has to be matched on the next consecutive line of the disassembly, e.g.: __jited(" endbr64") # matched on line N __jited(" nopl (%rax,%rax)") # matched on line N+1 If match occurs on a wrong line an error is reported. 
To override this behaviour use __jited("..."), e.g.: __jited(" endbr64") # matched on line N __jited("...") # not matched __jited(" nopl (%rax,%rax)") # matched on any line >= N Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 35 +++++++ tools/testing/selftests/bpf/test_loader.c | 149 ++++++++++++++++++++++----- 2 files changed, 161 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index cc3ef20a6490..eccaf955e394 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -36,6 +36,39 @@ * Regular expressions could be specified same way as in __msg. * __xlated_unpriv Same as __xlated but for unprivileged mode. * + * __jited Match a line in a disassembly of the jited BPF program. + * Has to be used after __arch_* macro. + * For example: + * + * __arch_x86_64 + * __jited(" endbr64") + * __jited(" nopl (%rax,%rax)") + * __jited(" xorq %rax, %rax") + * ... + * __naked void some_test(void) + * { + * asm volatile (... ::: __clobber_all); + * } + * + * Regular expressions could be included in patterns same way + * as in __msg. + * + * By default assume that each pattern has to be matched on the + * next consecutive line of disassembly, e.g.: + * + * __jited(" endbr64") # matched on line N + * __jited(" nopl (%rax,%rax)") # matched on line N+1 + * + * If match occurs on a wrong line an error is reported. + * To override this behaviour use literal "...", e.g.: + * + * __jited(" endbr64") # matched on line N + * __jited("...") # not matched + * __jited(" nopl (%rax,%rax)") # matched on any line >= N + * + * __jited_unpriv Same as __jited but for unprivileged mode. + * + * * __success Expect program load success in privileged mode. * __success_unpriv Expect program load success in unprivileged mode. 
* @@ -76,11 +109,13 @@ */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) +#define __jited(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __jited_unpriv(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b0d7158e00c1..d588c612ac03 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -10,6 +10,7 @@ #include "disasm_helpers.h" #include "unpriv_helpers.h" #include "cap_helpers.h" +#include "jit_disasm_helpers.h" #define str_has_pfx(str, pfx) \ (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) @@ -33,6 +34,8 @@ #define TEST_TAG_AUXILIARY_UNPRIV "comment:test_auxiliary_unpriv" #define TEST_BTF_PATH "comment:test_btf_path=" #define TEST_TAG_ARCH "comment:test_arch=" +#define TEST_TAG_JITED_PFX "comment:test_jited=" +#define TEST_TAG_JITED_PFX_UNPRIV "comment:test_jited_unpriv=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xcafe4all @@ -68,6 +71,7 @@ struct test_subspec { bool expect_failure; struct expected_msgs expect_msgs; struct expected_msgs expect_xlated; + struct expected_msgs jited; int retval; bool execute; }; @@ -124,6 +128,8 @@ static void free_test_spec(struct test_spec *spec) free_msgs(&spec->unpriv.expect_msgs); free_msgs(&spec->priv.expect_xlated); free_msgs(&spec->unpriv.expect_xlated); + free_msgs(&spec->priv.jited); + free_msgs(&spec->unpriv.jited); free(spec->priv.name); free(spec->unpriv.name); @@ -237,6 +243,21 @@ static int push_msg(const char *substr, struct expected_msgs *msgs) return __push_msg(substr, false, msgs); } +static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct expected_msgs *msgs) +{ + int err; + + if (strcmp(regex_str, "...") == 0) { + *on_next_line = false; + return 0; + } + err = __push_msg(regex_str, *on_next_line, msgs); + if (err) + return err; + *on_next_line = true; + return 0; +} + static int parse_int(const char *str, int *val, const char *name) { char *end; @@ -320,6 +341,18 @@ enum arch { ARCH_RISCV64 = 0x4, }; +static int get_current_arch(void) +{ +#if defined(__x86_64__) + return ARCH_X86_64; +#elif defined(__aarch64__) + return ARCH_ARM64; +#elif defined(__riscv) && __riscv_xlen == 64 + return ARCH_RISCV64; +#endif + return 0; +} + /* Uses btf_decl_tag attributes to describe the expected test * behavior, see bpf_misc.h for detailed description of each attribute * and attribute combinations. 
@@ -332,9 +365,13 @@ static int parse_test_spec(struct test_loader *tester, const char *description = NULL; bool has_unpriv_result = false; bool has_unpriv_retval = false; + bool unpriv_jit_on_next_line; + bool jit_on_next_line; + bool collect_jit = false; int func_id, i, err = 0; u32 arch_mask = 0; struct btf *btf; + enum arch arch; memset(spec, 0, sizeof(*spec)); @@ -399,6 +436,30 @@ static int parse_test_spec(struct test_loader *tester, if (err) goto cleanup; spec->mode_mask |= UNPRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_JITED_PFX))) { + if (arch_mask == 0) { + PRINT_FAIL("__jited used before __arch_*"); + goto cleanup; + } + if (collect_jit) { + err = push_disasm_msg(msg, &jit_on_next_line, + &spec->priv.jited); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_JITED_PFX_UNPRIV))) { + if (arch_mask == 0) { + PRINT_FAIL("__unpriv_jited used before __arch_*"); + goto cleanup; + } + if (collect_jit) { + err = push_disasm_msg(msg, &unpriv_jit_on_next_line, + &spec->unpriv.jited); + if (err) + goto cleanup; + spec->mode_mask |= UNPRIV; + } } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { err = push_msg(msg, &spec->priv.expect_xlated); if (err) @@ -459,16 +520,20 @@ static int parse_test_spec(struct test_loader *tester, } else if (str_has_pfx(s, TEST_TAG_ARCH)) { val = s + sizeof(TEST_TAG_ARCH) - 1; if (strcmp(val, "X86_64") == 0) { - arch_mask |= ARCH_X86_64; + arch = ARCH_X86_64; } else if (strcmp(val, "ARM64") == 0) { - arch_mask |= ARCH_ARM64; + arch = ARCH_ARM64; } else if (strcmp(val, "RISCV64") == 0) { - arch_mask |= ARCH_RISCV64; + arch = ARCH_RISCV64; } else { PRINT_FAIL("bad arch spec: '%s'", val); err = -EINVAL; goto cleanup; } + arch_mask |= arch; + collect_jit = get_current_arch() == arch; + unpriv_jit_on_next_line = true; + jit_on_next_line = true; } else if (str_has_pfx(s, TEST_BTF_PATH)) { spec->btf_custom_path = s + sizeof(TEST_BTF_PATH) - 1; } @@ -521,6 +586,8 @@ static int parse_test_spec(struct test_loader *tester, clone_msgs(&spec->priv.expect_msgs, &spec->unpriv.expect_msgs); if (spec->unpriv.expect_xlated.cnt == 0) clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); + if (spec->unpriv.jited.cnt == 0) + clone_msgs(&spec->priv.jited, &spec->unpriv.jited); } spec->valid = true; @@ -575,16 +642,29 @@ static void emit_xlated(const char *xlated, bool force) fprintf(stdout, "XLATED:\n=============\n%s=============\n", xlated); } +static void emit_jited(const char *jited, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "JITED:\n=============\n%s=============\n", jited); +} + static void validate_msgs(char *log_buf, struct expected_msgs *msgs, void (*emit_fn)(const char *buf, bool force)) { + const char *log = log_buf, *prev_match; regmatch_t reg_match[1]; - const char *log = log_buf; + int prev_match_line; + int match_line; int i, j, err; + prev_match_line = -1; + match_line = 0; + prev_match = log; for (i = 0; i < msgs->cnt; i++) { struct expect_msg *msg = &msgs->patterns[i]; - const char *match = NULL; + const char *match = NULL, *pat_status; + bool wrong_line = false; if (!msg->is_regex) { match = strstr(log, msg->substr); @@ -598,19 +678,41 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, } } - if (!match) { + if (match) { + for (; prev_match < match; ++prev_match) + if (*prev_match == '\n') + ++match_line; + wrong_line = msg->on_next_line && prev_match_line >= 0 && + prev_match_line + 1 != match_line; + 
} + + if (!match || wrong_line) { PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { msg = &msgs->patterns[j]; + if (j < i) + pat_status = "MATCHED "; + else if (wrong_line) + pat_status = "WRONG LINE"; + else + pat_status = "EXPECTED "; + msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", - j < i ? "MATCHED " : "EXPECTED", + pat_status, msg->is_regex ? " REGEX" : "SUBSTR", msg->substr); } - return; + if (wrong_line) { + fprintf(stderr, + "expecting match at line %d, actual match is at line %d\n", + prev_match_line + 1, match_line); + } + break; } + + prev_match_line = match_line; } } @@ -769,20 +871,6 @@ out: return err; } -static bool run_on_current_arch(int arch_mask) -{ - if (arch_mask == 0) - return true; -#if defined(__x86_64__) - return arch_mask & ARCH_X86_64; -#elif defined(__aarch64__) - return arch_mask & ARCH_ARM64; -#elif defined(__riscv) && __riscv_xlen == 64 - return arch_mask & ARCH_RISCV64; -#endif - return false; -} - /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -807,7 +895,7 @@ void run_subtest(struct test_loader *tester, if (!test__start_subtest(subspec->name)) return; - if (!run_on_current_arch(spec->arch_mask)) { + if ((get_current_arch() & spec->arch_mask) == 0) { test__skip(); return; } @@ -884,6 +972,21 @@ void run_subtest(struct test_loader *tester, validate_msgs(tester->log_buf, &subspec->expect_xlated, emit_xlated); } + if (subspec->jited.cnt) { + err = get_jited_program_text(bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err == -EOPNOTSUPP) { + printf("%s:SKIP: jited programs disassembly is not supported,\n", __func__); + printf("%s:SKIP: tests are built w/o LLVM development libs\n", __func__); + test__skip(); + goto tobj_cleanup; + } + if (!ASSERT_EQ(err, 0, "get_jited_program_text")) + goto tobj_cleanup; + emit_jited(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->jited, emit_jited); + } + if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs * with all capabilities restored. Do the same here. -- cgit v1.2.3 From e5bdd6a8be783eb0c960723d9f37df7ee931d6d6 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:56 -0700 Subject: selftests/bpf: validate jit behaviour for tail calls Add a test with a program calling a sub-program which does a tail call. The idea is to verify the instructions generated by the jit for tail calls: - in the program and sub-program prologues; - for the subprogram call instruction; - for the tail call itself.
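For reference, here is a minimal C sketch of the counter protocol that the __jited annotations below check; MAX_TAIL_CALL_CNT is the real kernel limit (33, i.e. the 0x21 compared against in the jited code), while the function and parameter names are illustrative only:

#define MAX_TAIL_CALL_CNT 33

/* Hypothetical model of the jited tail call sequence: load the shared
 * counter through tail_call_cnt_ptr (spilled at rbp[-16]), refuse the
 * jump once the limit is hit, otherwise bump the counter and jump.
 */
static int do_tail_call(long *tail_call_cnt_ptr)
{
	if (*tail_call_cnt_ptr >= MAX_TAIL_CALL_CNT)	/* cmpq $0x21, (%rax); jae L0 */
		return -1;				/* leave; retq, no jump taken */
	*tail_call_cnt_ptr += 1;			/* addq $0x1, (%rax) */
	/* jump to the tail call target happens here */
	return 0;
}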
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_tailcall_jit.c | 105 +++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index f8f546eba488..cf3662dbd24f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -75,6 +75,7 @@ #include "verifier_stack_ptr.skel.h" #include "verifier_subprog_precision.skel.h" #include "verifier_subreg.skel.h" +#include "verifier_tailcall_jit.skel.h" #include "verifier_typedef.skel.h" #include "verifier_uninit.skel.h" #include "verifier_unpriv.skel.h" @@ -198,6 +199,7 @@ void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } void test_verifier_subprog_precision(void) { RUN(verifier_subprog_precision); } void test_verifier_subreg(void) { RUN(verifier_subreg); } +void test_verifier_tailcall_jit(void) { RUN(verifier_tailcall_jit); } void test_verifier_typedef(void) { RUN(verifier_typedef); } void test_verifier_uninit(void) { RUN(verifier_uninit); } void test_verifier_unpriv(void) { RUN(verifier_unpriv); } diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c new file mode 100644 index 000000000000..06d327cf1e1f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int main(void); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps") = { + .values = { + [0] = (void *) &main, + }, +}; + +__noinline __auxiliary +static __naked int sub(void) +{ + asm volatile ( + "r2 = %[jmp_table] ll;" + "r3 = 0;" + "call 12;" + "exit;" + : + : __imm_addr(jmp_table) + : __clobber_all); +} + +__success +__arch_x86_64 +/* program entry for main(), regular function prologue */ +__jited(" endbr64") +__jited(" nopl (%rax,%rax)") +__jited(" xorq %rax, %rax") +__jited(" pushq %rbp") +__jited(" movq %rsp, %rbp") +/* tail call prologue for program: + * - establish memory location for tail call counter at &rbp[-8]; + * - spill tail_call_cnt_ptr at &rbp[-16]; + * - expect tail call counter to be passed in rax; + * - for entry program rax is a raw counter, value < 33; + * - for tail called program rax is tail_call_cnt_ptr (value > 33). 
+ */ +__jited(" endbr64") +__jited(" cmpq $0x21, %rax") +__jited(" ja L0") +__jited(" pushq %rax") +__jited(" movq %rsp, %rax") +__jited(" jmp L1") +__jited("L0: pushq %rax") /* rbp[-8] = rax */ +__jited("L1: pushq %rax") /* rbp[-16] = rax */ +/* on subprogram call restore rax to be tail_call_cnt_ptr from rbp[-16] + * (cause original rax might be clobbered by this point) + */ +__jited(" movq -0x10(%rbp), %rax") +__jited(" callq 0x{{.*}}") /* call to sub() */ +__jited(" xorl %eax, %eax") +__jited(" leave") +__jited(" retq") +__jited("...") +/* subprogram entry for sub(), regular function prologue */ +__jited(" endbr64") +__jited(" nopl (%rax,%rax)") +__jited(" nopl (%rax)") +__jited(" pushq %rbp") +__jited(" movq %rsp, %rbp") +/* tail call prologue for subprogram address of tail call counter + * stored at rbp[-16]. + */ +__jited(" endbr64") +__jited(" pushq %rax") /* rbp[-8] = rax */ +__jited(" pushq %rax") /* rbp[-16] = rax */ +__jited(" movabsq ${{.*}}, %rsi") /* r2 = &jmp_table */ +__jited(" xorl %edx, %edx") /* r3 = 0 */ +/* bpf_tail_call implementation: + * - load tail_call_cnt_ptr from rbp[-16]; + * - if *tail_call_cnt_ptr < 33, increment it and jump to target; + * - otherwise do nothing. + */ +__jited(" movq -0x10(%rbp), %rax") +__jited(" cmpq $0x21, (%rax)") +__jited(" jae L0") +__jited(" nopl (%rax,%rax)") +__jited(" addq $0x1, (%rax)") /* *tail_call_cnt_ptr += 1 */ +__jited(" popq %rax") +__jited(" popq %rax") +__jited(" jmp {{.*}}") /* jump to tail call tgt */ +__jited("L0: leave") +__jited(" retq") +SEC("tc") +__naked int main(void) +{ + asm volatile ( + "call %[sub];" + "r0 = 0;" + "exit;" + : + : __imm(sub) + : __clobber_all); +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a038eacdbf59e6024c9e8da5df7f293e64cf20de Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:57 -0700 Subject: selftests/bpf: validate __xlated same way as __jited Both __xlated and __jited work with disassembly. It is logical to have both work in a similar manner. This commit updates __xlated macro handling in test_loader.c by making it expect matches on sequential lines, same way as __jited operates. 
For example: __xlated("1: *(u64 *)(r10 -16) = r1") ;; matched on line N __xlated("3: r0 = &(void __percpu *)(r0)") ;; matched on line N+1 Also: __xlated("1: *(u64 *)(r10 -16) = r1") ;; matched on line N __xlated("...") ;; not matched __xlated("3: r0 = &(void __percpu *)(r0)") ;; matched on any ;; line >= N Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_nocsr.c | 53 +++++++++++++++++++++- tools/testing/selftests/bpf/test_loader.c | 8 +++- 2 files changed, 57 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_nocsr.c index a7fe277e5167..666c736d196f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_nocsr.c +++ b/tools/testing/selftests/bpf/progs/verifier_nocsr.c @@ -78,6 +78,7 @@ __naked void canary_arm64_riscv64(void) SEC("raw_tp") __arch_x86_64 __xlated("1: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("3: exit") __success __naked void canary_zero_spills(void) @@ -94,7 +95,9 @@ SEC("raw_tp") __arch_x86_64 __log_level(4) __msg("stack depth 16") __xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r2 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern1(void) @@ -113,7 +116,9 @@ __naked void wrong_reg_in_pattern1(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r6") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r6 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern2(void) @@ -132,7 +137,9 @@ __naked void wrong_reg_in_pattern2(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r0") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r0 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern3(void) @@ -151,7 +158,9 @@ __naked void wrong_reg_in_pattern3(void) SEC("raw_tp") __arch_x86_64 __xlated("2: *(u64 *)(r2 -16) = r1") +__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u64 *)(r10 -16)") __success __naked void wrong_base_in_pattern(void) @@ -171,7 +180,9 @@ __naked void wrong_base_in_pattern(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r2 = 1") __success __naked void wrong_insn_in_pattern(void) @@ -191,7 +202,9 @@ __naked void wrong_insn_in_pattern(void) SEC("raw_tp") __arch_x86_64 __xlated("2: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u64 *)(r10 -8)") __success __naked void wrong_off_in_pattern1(void) @@ -211,7 +224,9 @@ __naked void wrong_off_in_pattern1(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u32 *)(r10 -4) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u32 *)(r10 -4)") __success __naked void wrong_off_in_pattern2(void) @@ -230,7 +245,9 @@ __naked void wrong_off_in_pattern2(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u32 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u32 *)(r10 -16)") __success __naked void wrong_size_in_pattern(void) @@ -249,7 +266,9 @@ __naked void wrong_size_in_pattern(void) SEC("raw_tp") __arch_x86_64
__xlated("2: *(u32 *)(r10 -8) = r1") +__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u32 *)(r10 -8)") __success __naked void partial_pattern(void) @@ -275,11 +294,15 @@ __xlated("1: r2 = 2") /* not patched, spills for -8, -16 not removed */ __xlated("2: *(u64 *)(r10 -8) = r1") __xlated("3: *(u64 *)(r10 -16) = r2") +__xlated("...") __xlated("5: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("7: r2 = *(u64 *)(r10 -16)") __xlated("8: r1 = *(u64 *)(r10 -8)") /* patched, spills for -24, -32 removed */ +__xlated("...") __xlated("10: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("12: exit") __success __naked void min_stack_offset(void) @@ -308,7 +331,9 @@ __naked void min_stack_offset(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_fixed_read(void) @@ -330,7 +355,9 @@ __naked void bad_fixed_read(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_fixed_write(void) @@ -352,7 +379,9 @@ __naked void bad_fixed_write(void) SEC("raw_tp") __arch_x86_64 __xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: r1 = *(u64 *)(r10 -16)") __success __naked void bad_varying_read(void) @@ -379,7 +408,9 @@ __naked void bad_varying_read(void) SEC("raw_tp") __arch_x86_64 __xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: r1 = *(u64 *)(r10 -16)") __success __naked void bad_varying_write(void) @@ -406,7 +437,9 @@ __naked void bad_varying_write(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_write_in_subprog(void) @@ -438,7 +471,9 @@ __naked static void bad_write_in_subprog_aux(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_helper_write(void) @@ -466,13 +501,19 @@ SEC("raw_tp") __arch_x86_64 /* main, not patched */ __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") +__xlated("...") __xlated("9: call pc+1") +__xlated("...") __xlated("10: exit") /* subprogram, patched */ __xlated("11: r1 = 1") +__xlated("...") __xlated("13: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("15: exit") __success __naked void invalidate_one_subprog(void) @@ -510,12 +551,16 @@ SEC("raw_tp") __arch_x86_64 /* main */ __xlated("0: r1 = 1") +__xlated("...") __xlated("2: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("4: call pc+1") __xlated("5: exit") /* subprogram */ __xlated("6: r1 = 1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: *(u64 *)(r10 -16) = r1") __xlated("11: exit") __success @@ -576,7 +621,9 @@ __log_level(4) __msg("stack depth 16") /* may_goto counter at -16 */ __xlated("0: *(u64 *)(r10 -16) =") __xlated("1: r1 = 1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") /* may_goto expansion 
starts */ __xlated("5: r11 = *(u64 *)(r10 -16)") __xlated("6: if r11 == 0x0 goto pc+3") @@ -623,13 +670,15 @@ __xlated("5: r0 = *(u32 *)(r0 +0)") __xlated("6: r2 =") __xlated("7: r3 = 0") __xlated("8: r4 = 0") +__xlated("...") /* ... part of the inlined bpf_loop */ __xlated("12: *(u64 *)(r10 -32) = r6") __xlated("13: *(u64 *)(r10 -24) = r7") __xlated("14: *(u64 *)(r10 -16) = r8") -/* ... */ +__xlated("...") __xlated("21: call pc+8") /* dummy_loop_callback */ /* ... last insns of the bpf_loop_interaction1 */ +__xlated("...") __xlated("28: r0 = 0") __xlated("29: exit") /* dummy_loop_callback */ @@ -670,7 +719,7 @@ __xlated("5: r0 = *(u32 *)(r0 +0)") __xlated("6: *(u64 *)(r10 -16) = r1") __xlated("7: call") __xlated("8: r1 = *(u64 *)(r10 -16)") -/* ... */ +__xlated("...") /* ... part of the inlined bpf_loop */ __xlated("15: *(u64 *)(r10 -40) = r6") __xlated("16: *(u64 *)(r10 -32) = r7") diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index d588c612ac03..b229dd013355 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -365,6 +365,8 @@ static int parse_test_spec(struct test_loader *tester, const char *description = NULL; bool has_unpriv_result = false; bool has_unpriv_retval = false; + bool unpriv_xlated_on_next_line = true; + bool xlated_on_next_line = true; bool unpriv_jit_on_next_line; bool jit_on_next_line; bool collect_jit = false; @@ -461,12 +463,14 @@ static int parse_test_spec(struct test_loader *tester, spec->mode_mask |= UNPRIV; } } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { - err = push_msg(msg, &spec->priv.expect_xlated); + err = push_disasm_msg(msg, &xlated_on_next_line, + &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { - err = push_msg(msg, &spec->unpriv.expect_xlated); + err = push_disasm_msg(msg, &unpriv_xlated_on_next_line, + &spec->unpriv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= UNPRIV; -- cgit v1.2.3 From f32c821ae0198cf43181711efb18376b2eb6a1cb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Aug 2024 17:12:22 +0200 Subject: tools: ynl: lift an assumption about spec file name Currently the parsing code generator assumes that the yaml specification file name and the main 'name' attribute carried inside correspond, that is, the 'name' field is the c-name representation of the file basename. The above assumption held true within the current tree, but will hopefully be broken soon by the upcoming net shaper specification. Additionally, the assumption makes the 'name' field itself useless. Lift the assumption, always computing the generated include file name from the generated c file name.
Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/24da5a3596d814beeb12bd7139a6b4f89756cc19.1724165948.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 51529fabd517..717530bc9c52 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2668,13 +2668,15 @@ def main(): cw.p('#define ' + hdr_prot) cw.nl() + hdr_file=os.path.basename(args.out_file[:-2]) + ".h" + if args.mode == 'kernel': cw.p('#include ') cw.p('#include ') cw.nl() if not args.header: if args.out_file: - cw.p(f'#include "{os.path.basename(args.out_file[:-2])}.h"') + cw.p(f'#include "{hdr_file}"') cw.nl() headers = ['uapi/' + parsed.uapi_header] headers += parsed.kernel_family.get('headers', []) @@ -2686,7 +2688,7 @@ def main(): if family_contains_bitfield32(parsed): cw.p('#include ') else: - cw.p(f'#include "{parsed.name}-user.h"') + cw.p(f'#include "{hdr_file}"') cw.p('#include "ynl.h"') headers = [parsed.uapi_header] for definition in parsed['definitions']: -- cgit v1.2.3 From af8a066f1c473261881a6d8e2b55cca8eda9ce80 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 20 Aug 2024 18:34:25 -0700 Subject: selftest: bpf: Remove mssind boundary check in test_tcp_custom_syncookie.c. Smatch reported a possible off-by-one in tcp_validate_cookie(). However, it's a false positive because the possible range of mssind is limited from 0 to 3 by the preceding calculation. mssind = (cookie & (3 << 6)) >> 6; Now, the verifier does not complain without the boundary check. Let's remove the checks. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/bpf/6ae12487-d3f1-488b-9514-af0dac96608f@stanley.mountain/ Signed-off-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240821013425.49316-1-kuniyu@amazon.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c index 44ee0d037f95..eb5cca1fce16 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c @@ -486,17 +486,10 @@ static int tcp_validate_cookie(struct tcp_syncookie *ctx) goto err; mssind = (cookie & (3 << 6)) >> 6; - if (ctx->ipv4) { - if (mssind > ARRAY_SIZE(msstab4)) - goto err; - + if (ctx->ipv4) ctx->attrs.mss = msstab4[mssind]; - } else { - if (mssind > ARRAY_SIZE(msstab6)) - goto err; - + else ctx->attrs.mss = msstab6[mssind]; - } ctx->attrs.snd_wscale = cookie & BPF_SYNCOOKIE_WSCALE_MASK; ctx->attrs.rcv_wscale = ctx->attrs.snd_wscale; -- cgit v1.2.3 From 611fbeb44a777e5ab54ab3127ec85f72147911d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 22 Aug 2024 05:39:32 +0100 Subject: selftests:core: test coverage for dup_fd() failure handling in unshare_fd() At some point there'd been a dumb braino during the dup_fd() calling conventions change; caught by smatch and immediately fixed. The trouble is, there had been no test coverage for the dup_fd() failure handling - neither in kselftests nor in LTP. Fortunately, it can be triggered on a stock kernel - ENOMEM would require fault injection, but EMFILE can be had with sysctl alone (fs.nr_open). Add a test for dup_fd() failure.
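As a rough sketch of the condition being exercised: once fs.nr_open has been lowered back below an already-open descriptor in a CLONE_FILES-shared table, the failing call reduces to something like this (the helper name is illustrative, not part of the test):

#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>

/* unshare(CLONE_FILES) has to duplicate the shared descriptor table
 * via dup_fd(); with a descriptor above the restored fs.nr_open limit
 * the copy cannot be sized, so the call must fail with EMFILE.
 */
static int check_unshare_emfile(void)
{
	if (unshare(CLONE_FILES) == -1 && errno == EMFILE) {
		printf("got expected EMFILE\n");
		return 0;
	}
	return 1;
}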
Fixed up commit log and short log - Shuah Khan Signed-off-by: Al Viro Signed-off-by: Shuah Khan --- tools/testing/selftests/core/Makefile | 2 +- tools/testing/selftests/core/unshare_test.c | 94 +++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/core/unshare_test.c (limited to 'tools') diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile index ce262d097269..8e99f87f5d7c 100644 --- a/tools/testing/selftests/core/Makefile +++ b/tools/testing/selftests/core/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g $(KHDR_INCLUDES) -TEST_GEN_PROGS := close_range_test +TEST_GEN_PROGS := close_range_test unshare_test include ../lib.mk diff --git a/tools/testing/selftests/core/unshare_test.c b/tools/testing/selftests/core/unshare_test.c new file mode 100644 index 000000000000..7fec9dfb1b0e --- /dev/null +++ b/tools/testing/selftests/core/unshare_test.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" +#include "../clone3/clone3_selftests.h" + +TEST(unshare_EMFILE) +{ + pid_t pid; + int status; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + int fd; + ssize_t n, n2; + static char buf[512], buf2[512]; + struct rlimit rlimit; + int nr_open; + + fd = open("/proc/sys/fs/nr_open", O_RDWR); + ASSERT_GE(fd, 0); + + n = read(fd, buf, sizeof(buf)); + ASSERT_GT(n, 0); + ASSERT_EQ(buf[n - 1], '\n'); + + ASSERT_EQ(sscanf(buf, "%d", &nr_open), 1); + + ASSERT_EQ(0, getrlimit(RLIMIT_NOFILE, &rlimit)); + + /* bump fs.nr_open */ + n2 = sprintf(buf2, "%d\n", nr_open + 1024); + lseek(fd, 0, SEEK_SET); + write(fd, buf2, n2); + + /* bump ulimit -n */ + rlimit.rlim_cur = nr_open + 1024; + rlimit.rlim_max = nr_open + 1024; + EXPECT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlimit)) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + /* get a descriptor past the old fs.nr_open */ + EXPECT_GE(dup2(2, nr_open + 64), 0) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + /* get descriptor table shared */ + pid = sys_clone3(&args, sizeof(args)); + EXPECT_GE(pid, 0) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + if (pid == 0) { + int err; + + /* restore fs.nr_open */ + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + /* ... and now unshare(CLONE_FILES) must fail with EMFILE */ + err = unshare(CLONE_FILES); + EXPECT_EQ(err, -1) + exit(EXIT_FAILURE); + EXPECT_EQ(errno, EMFILE) + exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From bcc3773c49af2426093645473fe02a5a8e765587 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Aug 2024 18:22:27 -0700 Subject: selftests: net: add helper for checking if nettest is available A few tests check if nettest exists in the $PATH before adding $PWD to $PATH and re-checking. They don't discard stderr on the first check (and nettest is built as part of selftests, so it's pretty normal for it to not be available in system $PATH). 
This leads to output noise: which: no nettest in (/home/virtme/tools/fs/bin:/home/virtme/tools/fs/sbin:/home/virtme/tools/fs/usr/bin:/home/virtme/tools/fs/usr/sbin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin) Add a common helper for the check which does silence stderr. There is another small functional change hiding here, because pmtu.sh and fib_rule_tests.sh used to return from the test case rather than completely exit. Building nettest is not hard, there should be no need to maintain the ability to selectively skip cases in its absence. Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski Reviewed-by: Hangbin Liu Link: https://patch.msgid.link/20240821012227.1398769-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/fcnal-test.sh | 9 +----- tools/testing/selftests/net/fib_rule_tests.sh | 37 ++--------------------- tools/testing/selftests/net/lib.sh | 15 +++++++++ tools/testing/selftests/net/pmtu.sh | 8 +---- tools/testing/selftests/net/unicast_extensions.sh | 9 +----- tools/testing/selftests/net/vrf_route_leaking.sh | 3 +- 6 files changed, 22 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 386ebd829df5..899dbad0104b 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -4304,14 +4304,7 @@ elif [ "$TESTS" = "ipv6" ]; then TESTS="$TESTS_IPV6" fi -# nettest can be run from PATH or from same directory as this selftest -if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - exit $ksft_skip - fi -fi +check_gen_prog "nettest" declare -i nfail=0 declare -i nsuccess=0 diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 89034c5b69dc..53c5c1ad437e 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -51,31 +51,6 @@ log_test() fi } -check_nettest() -{ - if which nettest > /dev/null 2>&1; then - return 0 - fi - - # Add the selftest directory to PATH if not already done - if [ "${SELFTEST_PATH}" = "" ]; then - SELFTEST_PATH="$(dirname $0)" - PATH="${PATH}:${SELFTEST_PATH}" - - # Now retry with the new path - if which nettest > /dev/null 2>&1; then - return 0 - fi - - if [ "${ret}" -eq 0 ]; then - ret="${ksft_skip}" - fi - echo "nettest not found (try 'make -C ${SELFTEST_PATH} nettest')" - fi - - return 1 -} - setup() { set -e @@ -317,11 +292,6 @@ fib_rule6_connect_test() echo echo "IPv6 FIB rule connect tests" - if ! check_nettest; then - echo "SKIP: Could not run test without nettest tool" - return - fi - setup_peer $IP -6 rule add dsfield 0x04 table $RTABLE_PEER @@ -516,11 +486,6 @@ fib_rule4_connect_test() echo echo "IPv4 FIB rule connect tests" - if ! check_nettest; then - echo "SKIP: Could not run test without nettest tool" - return - fi - setup_peer $IP -4 rule add dsfield 0x04 table $RTABLE_PEER @@ -584,6 +549,8 @@ if [ ! 
-x "$(command -v ip)" ]; then exit $ksft_skip fi +check_gen_prog "nettest" + # start clean cleanup &> /dev/null setup diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 8ee4489238ca..be8707bfb46e 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -125,6 +125,21 @@ slowwait_for_counter() slowwait "$timeout" until_counter_is ">= $((base + delta))" "$@" } +# Check for existence of tools which are built as part of selftests +# but may also already exist in $PATH +check_gen_prog() +{ + local prog_name=$1; shift + + if ! which $prog_name >/dev/null 2>/dev/null; then + PATH=$PWD:$PATH + if ! which $prog_name >/dev/null; then + echo "'$prog_name' command not found; skipping tests" + exit $ksft_skip + fi + fi +} + remove_ns_list() { local item=$1 diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 24a50622406c..569bce8b6383 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -681,13 +681,7 @@ setup_xfrm() { } setup_nettest_xfrm() { - if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - return 1 - fi - fi + check_gen_prog "nettest" [ ${1} -eq 6 ] && proto="-6" || proto="" port=${2} diff --git a/tools/testing/selftests/net/unicast_extensions.sh b/tools/testing/selftests/net/unicast_extensions.sh index f52aa5f7da52..3e751234ccfe 100755 --- a/tools/testing/selftests/net/unicast_extensions.sh +++ b/tools/testing/selftests/net/unicast_extensions.sh @@ -30,14 +30,7 @@ source lib.sh -# nettest can be run from PATH or from same directory as this selftest -if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - exit $ksft_skip - fi -fi +check_gen_prog "nettest" result=0 diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh index 152171fb1fc8..e9c2f71da207 100755 --- a/tools/testing/selftests/net/vrf_route_leaking.sh +++ b/tools/testing/selftests/net/vrf_route_leaking.sh @@ -59,7 +59,6 @@ # while it is forwarded between different vrfs. source lib.sh -PATH=$PWD:$PWD/tools/testing/selftests/net:$PATH VERBOSE=0 PAUSE_ON_FAIL=no DEFAULT_TTYPE=sym @@ -636,6 +635,8 @@ EOF # Some systems don't have a ping6 binary anymore command -v ping6 > /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping) +check_gen_prog "nettest" + TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_local ipv4_tcp_local ipv4_udp_local ipv4_ping_ttl_asym ipv4_traceroute_asym" TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_local ipv6_tcp_local ipv6_udp_local -- cgit v1.2.3 From 110bbd3a2ed71f3cf4d9438d62f22f591952fc3b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:01:24 -0700 Subject: selftests/bpf: test for malformed BPF_CORE_TYPE_ID_LOCAL relocation Check that verifier rejects BPF program containing relocation pointing to non-existent BTF type. To force relocation resolution on kernel side test case uses bpf_attr->core_relos field. This field is not exposed by libbpf, so directly do BPF system call in the test. 
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822080124.2995724-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/core_reloc_raw.c | 125 +++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c b/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c new file mode 100644 index 000000000000..a18d3680fb16 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Test cases that can't load programs using libbpf and need direct + * BPF syscall access + */ + +#include +#include +#include + +#include "test_progs.h" +#include "test_btf.h" +#include "bpf/libbpf_internal.h" + +static char log[16 * 1024]; + +/* Check that verifier rejects BPF program containing relocation + * pointing to non-existent BTF type. + */ +static void test_bad_local_id(void) +{ + struct test_btf { + struct btf_header hdr; + __u32 types[15]; + char strings[128]; + } raw_btf = { + .hdr = { + .magic = BTF_MAGIC, + .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), + .type_off = 0, + .type_len = sizeof(raw_btf.types), + .str_off = offsetof(struct test_btf, strings) - + offsetof(struct test_btf, types), + .str_len = sizeof(raw_btf.strings), + }, + .types = { + BTF_PTR_ENC(0), /* [1] void* */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [2] int */ + BTF_FUNC_PROTO_ENC(2, 1), /* [3] int (*)(void*) */ + BTF_FUNC_PROTO_ARG_ENC(8, 1), + BTF_FUNC_ENC(8, 3) /* [4] FUNC 'foo' type_id=2 */ + }, + .strings = "\0int\0 0\0foo\0" + }; + __u32 log_level = 1 | 2 | 4; + LIBBPF_OPTS(bpf_btf_load_opts, opts, + .log_buf = log, + .log_size = sizeof(log), + .log_level = log_level, + ); + struct bpf_insn insns[] = { + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + struct bpf_func_info funcs[] = { + { + .insn_off = 0, + .type_id = 4, + } + }; + struct bpf_core_relo relos[] = { + { + .insn_off = 0, /* patch first instruction (r0 = 0) */ + .type_id = 100500, /* !!! 
this type id does not exist */ + .access_str_off = 6, /* offset of "0" */ + .kind = BPF_CORE_TYPE_ID_LOCAL, + } + }; + union bpf_attr attr; + int saved_errno; + int prog_fd = -1; + int btf_fd = -1; + + btf_fd = bpf_btf_load(&raw_btf, sizeof(raw_btf), &opts); + saved_errno = errno; + if (btf_fd < 0 || env.verbosity > VERBOSE_NORMAL) { + printf("-------- BTF load log start --------\n"); + printf("%s", log); + printf("-------- BTF load log end ----------\n"); + } + if (btf_fd < 0) { + PRINT_FAIL("bpf_btf_load() failed, errno=%d\n", saved_errno); + return; + } + + log[0] = 0; + memset(&attr, 0, sizeof(attr)); + attr.prog_btf_fd = btf_fd; + attr.prog_type = BPF_TRACE_RAW_TP; + attr.license = (__u64)"GPL"; + attr.insns = (__u64)&insns; + attr.insn_cnt = sizeof(insns) / sizeof(*insns); + attr.log_buf = (__u64)log; + attr.log_size = sizeof(log); + attr.log_level = log_level; + attr.func_info = (__u64)funcs; + attr.func_info_cnt = sizeof(funcs) / sizeof(*funcs); + attr.func_info_rec_size = sizeof(*funcs); + attr.core_relos = (__u64)relos; + attr.core_relo_cnt = sizeof(relos) / sizeof(*relos); + attr.core_relo_rec_size = sizeof(*relos); + prog_fd = sys_bpf_prog_load(&attr, sizeof(attr), 1); + saved_errno = errno; + if (prog_fd < 0 || env.verbosity > VERBOSE_NORMAL) { + printf("-------- program load log start --------\n"); + printf("%s", log); + printf("-------- program load log end ----------\n"); + } + if (prog_fd >= 0) { + PRINT_FAIL("sys_bpf_prog_load() expected to fail\n"); + goto out; + } + ASSERT_HAS_SUBSTR(log, "relo #0: bad type id 100500", "program load log"); + +out: + close(prog_fd); + close(btf_fd); +} + +void test_core_reloc_raw(void) +{ + if (test__start_subtest("bad_local_id")) + test_bad_local_id(); +} -- cgit v1.2.3 From a11b4222bb579dcf9646f3c4ecd2212ae762a2c8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Aug 2024 16:26:25 -0700 Subject: perf dwarf-aux: Handle bitfield members from pointer access The __die_find_member_offset_cb() failed to handle bitfield members which don't have DW_AT_data_member_location. As when adding member types in __add_member_cb(), it should fall back to checking the bit offset when it resolves the member type for an offset.
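For illustration, with a DWARF5-style encoding the fallback amounts to dividing DW_AT_data_bit_offset by 8 to recover the byte offset of the containing word (the struct below is a made-up example):

struct example {
	unsigned int a;		/* has DW_AT_data_member_location = 0  */
	unsigned int b : 3;	/* bitfield: DW_AT_data_bit_offset = 32 */
	unsigned int c : 5;	/* bitfield: DW_AT_data_bit_offset = 35 */
};

/* Byte offset used for the member lookup:
 *   b: 32 / 8 == 4
 *   c: 35 / 8 == 4
 * both bitfields resolve to the 4-byte word at offset 4, mirroring
 * the "loc /= 8" fallback added below.
 */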
Fixes: 437683a9941891c1 ("perf dwarf-aux: Handle type transfer for memory access") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821232628.353177-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 0151a8d14350..92eb9c8dc3e5 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1977,8 +1977,15 @@ static int __die_find_member_offset_cb(Dwarf_Die *die_mem, void *arg) return DIE_FIND_CB_SIBLING; /* Unions might not have location */ - if (die_get_data_member_location(die_mem, &loc) < 0) - loc = 0; + if (die_get_data_member_location(die_mem, &loc) < 0) { + Dwarf_Attribute attr; + + if (dwarf_attr_integrate(die_mem, DW_AT_data_bit_offset, &attr) && + dwarf_formudata(&attr, &loc) == 0) + loc /= 8; + else + loc = 0; + } if (offset == loc) return DIE_FIND_CB_END; -- cgit v1.2.3 From adec67d372fecd97585d76dbebb610d62ec9d2cc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:08 -0700 Subject: selftests/bpf: rename nocsr -> bpf_fastcall in selftests The attribute used by the LLVM implementation of the feature has been changed from no_caller_saved_registers to bpf_fastcall (see [1]). This commit replaces references to nocsr with references to bpf_fastcall to keep the LLVM and selftests parts in sync. [1] https://github.com/llvm/llvm-project/pull/105417 Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 4 +- .../selftests/bpf/progs/verifier_bpf_fastcall.c | 845 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/verifier_nocsr.c | 845 --------------------- 3 files changed, 847 insertions(+), 847 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c delete mode 100644 tools/testing/selftests/bpf/progs/verifier_nocsr.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index cf3662dbd24f..80a90c627182 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,7 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" -#include "verifier_nocsr.skel.h" +#include "verifier_bpf_fastcall.skel.h" #include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" @@ -177,7 +177,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } -void test_verifier_nocsr(void) { RUN(verifier_nocsr); } +void test_verifier_bpf_fastcall(void) { RUN(verifier_bpf_fastcall); } void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } diff --git
a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c new file mode 100644 index 000000000000..e30ab9fe5096 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -0,0 +1,845 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("4: r5 = 5") +__xlated("5: w0 = ") +__xlated("6: r0 = &(void __percpu *)(r0)") +__xlated("7: r0 = *(u32 *)(r0 +0)") +__xlated("8: exit") +__success +__naked void simple(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r10 - 16) = r1;" + "*(u64 *)(r10 - 24) = r2;" + "*(u64 *)(r10 - 32) = r3;" + "*(u64 *)(r10 - 40) = r4;" + "*(u64 *)(r10 - 48) = r5;" + "call %[bpf_get_smp_processor_id];" + "r5 = *(u64 *)(r10 - 48);" + "r4 = *(u64 *)(r10 - 40);" + "r3 = *(u64 *)(r10 - 32);" + "r2 = *(u64 *)(r10 - 24);" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +/* The logic for detecting and verifying bpf_fastcall pattern is the same for + * any arch, however x86 differs from arm64 or riscv64 in a way + * bpf_get_smp_processor_id is rewritten: + * - on x86 it is done by verifier + * - on arm64 and riscv64 it is done by jit + * + * Which leads to different xlated patterns for different archs: + * - on x86 the call is expanded as 3 instructions + * - on arm64 and riscv64 the call remains as is + * (but spills/fills are still removed) + * + * It is really desirable to check instruction indexes in the xlated + * patterns, so add this canary test to check that function rewrite by + * jit is correctly processed by bpf_fastcall logic, keep the rest of the + * tests as x86. 
+ */ +SEC("raw_tp") +__arch_arm64 +__arch_riscv64 +__xlated("0: r1 = 1") +__xlated("1: call bpf_get_smp_processor_id") +__xlated("2: exit") +__success +__naked void canary_arm64_riscv64(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("3: exit") +__success +__naked void canary_zero_spills(void) +{ + asm volatile ( + "call %[bpf_get_smp_processor_id];" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r2 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r6") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r6 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern2(void) +{ + asm volatile ( + "r6 = 1;" + "*(u64 *)(r10 - 16) = r6;" + "call %[bpf_get_smp_processor_id];" + "r6 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r0") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r0 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern3(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r10 - 16) = r0;" + "call %[bpf_get_smp_processor_id];" + "r0 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r2 -16) = r1") +__xlated("...") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("6: r1 = *(u64 *)(r10 -16)") +__success +__naked void wrong_base_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = r10;" + "*(u64 *)(r2 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r2 = 1") +__success +__naked void wrong_insn_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = 1;" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("6: r1 = *(u64 *)(r10 -8)") +__success +__naked void wrong_off_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -4) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") 
+__xlated("...") +__xlated("5: r1 = *(u32 *)(r10 -4)") +__success +__naked void wrong_off_in_pattern2(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 4) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 4);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -16) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u32 *)(r10 -16)") +__success +__naked void wrong_size_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u32 *)(r10 -8) = r1") +__xlated("...") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("6: r1 = *(u32 *)(r10 -8)") +__success +__naked void partial_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "*(u32 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u32 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("0: r1 = 1") +__xlated("1: r2 = 2") +/* not patched, spills for -8, -16 not removed */ +__xlated("2: *(u64 *)(r10 -8) = r1") +__xlated("3: *(u64 *)(r10 -16) = r2") +__xlated("...") +__xlated("5: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("7: r2 = *(u64 *)(r10 -16)") +__xlated("8: r1 = *(u64 *)(r10 -8)") +/* patched, spills for -24, -32 removed */ +__xlated("...") +__xlated("10: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("12: exit") +__success +__naked void min_stack_offset(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + /* this call won't be patched */ + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u64 *)(r10 - 8);" + /* this call would be patched */ + "*(u64 *)(r10 - 24) = r1;" + "*(u64 *)(r10 - 32) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 32);" + "r1 = *(u64 *)(r10 - 24);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_read(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_write(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r1 - 0) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_read(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" 
/* random scalar value */ + "r6 &= 0x7;" /* r6 range [0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "r1 = *(u8 *)(r1 - 0);" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_write(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ + "r6 &= 0x7;" /* r6 range [0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "*(u8 *)(r1 - 0) = r7;" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_write_in_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "call bad_write_in_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void bad_write_in_subprog_aux(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r1 - 0) = r0;" /* invalidates bpf_fastcall contract for caller: */ + "exit;" /* caller stack at -8 used outside of the pattern */ + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_helper_write(void) +{ + asm volatile ( + "r1 = 1;" + /* bpf_fastcall pattern with stack offset -8 */ + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r2 = 1;" + "r3 = 42;" + /* read dst is fp[-8], thus bpf_fastcall rewrite not applied */ + "call %[bpf_probe_read_kernel];" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_probe_read_kernel) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main, not patched */ +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__xlated("...") +__xlated("9: call pc+1") +__xlated("...") +__xlated("10: exit") +/* subprogram, patched */ +__xlated("11: r1 = 1") +__xlated("...") +__xlated("13: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("15: exit") +__success +__naked void invalidate_one_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "call invalidate_one_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void invalidate_one_subprog_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = 
*(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main */ +__xlated("0: r1 = 1") +__xlated("...") +__xlated("2: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("4: call pc+1") +__xlated("5: exit") +/* subprogram */ +__xlated("6: r1 = 1") +__xlated("...") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("10: *(u64 *)(r10 -16) = r1") +__xlated("11: exit") +__success +__naked void subprogs_use_independent_offsets(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "call subprogs_use_independent_offsets_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void subprogs_use_independent_offsets_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 24) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 24);" + "*(u64 *)(r10 - 16) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("2: r0 = &(void __percpu *)(r0)") +__success +__naked void helper_call_does_not_prevent_bpf_fastcall(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +/* may_goto counter at -16 */ +__xlated("0: *(u64 *)(r10 -16) =") +__xlated("1: r1 = 1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +/* may_goto expansion starts */ +__xlated("5: r11 = *(u64 *)(r10 -16)") +__xlated("6: if r11 == 0x0 goto pc+3") +__xlated("7: r11 -= 1") +__xlated("8: *(u64 *)(r10 -16) = r11") +/* may_goto expansion ends */ +__xlated("9: *(u64 *)(r10 -8) = r1") +__xlated("10: exit") +__success +__naked void may_goto_interaction(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + ".8byte %[may_goto];" + /* just touch some stack at -8 */ + "*(u64 *)(r10 - 8) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +__used +__naked static void dummy_loop_callback(void) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 32+0") +__xlated("2: r1 = 1") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* bpf_loop params setup */ +__xlated("6: r2 =") +__xlated("7: r3 = 0") +__xlated("8: r4 = 0") +__xlated("...") +/* ... part of the inlined bpf_loop */ +__xlated("12: *(u64 *)(r10 -32) = r6") +__xlated("13: *(u64 *)(r10 -24) = r7") +__xlated("14: *(u64 *)(r10 -16) = r8") +__xlated("...") +__xlated("21: call pc+8") /* dummy_loop_callback */ +/* ... 
last insns of the bpf_loop_interaction1 */ +__xlated("...") +__xlated("28: r0 = 0") +__xlated("29: exit") +/* dummy_loop_callback */ +__xlated("30: r0 = 0") +__xlated("31: exit") +__success +__naked int bpf_loop_interaction1(void) +{ + asm volatile ( + "r1 = 1;" + /* bpf_fastcall stack region at -16, but could be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 40+0") +/* call bpf_get_smp_processor_id */ +__xlated("2: r1 = 42") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* call bpf_get_prandom_u32 */ +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("7: call") +__xlated("8: r1 = *(u64 *)(r10 -16)") +__xlated("...") +/* ... part of the inlined bpf_loop */ +__xlated("15: *(u64 *)(r10 -40) = r6") +__xlated("16: *(u64 *)(r10 -32) = r7") +__xlated("17: *(u64 *)(r10 -24) = r8") +__success +__naked int bpf_loop_interaction2(void) +{ + asm volatile ( + "r1 = 42;" + /* bpf_fastcall stack region at -16, cannot be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512+0") +/* just to print xlated version when debugging */ +__xlated("r0 = &(void __percpu *)(r0)") +__success +/* cumulative_stack_depth() stack usage is MAX_BPF_STACK, + * called subprogram uses an additional slot for bpf_fastcall spill/fill, + * since bpf_fastcall spill/fill could be removed the program still fits + * in MAX_BPF_STACK and should be accepted. 
+ */ +__naked int cumulative_stack_depth(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "call cumulative_stack_depth_subprog;" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK) + : __clobber_all + ); +} + +__used +__naked static void cumulative_stack_depth_subprog(void) +{ + asm volatile ( + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + :: __imm(bpf_get_smp_processor_id) : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512") +__xlated("0: r1 = 42") +__xlated("1: *(u64 *)(r10 -512) = r1") +__xlated("2: w0 = ") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("4: r0 = *(u32 *)(r0 +0)") +__xlated("5: exit") +__success +__naked int bpf_fastcall_max_stack_ok(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id) + : __clobber_all + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 520") +__failure +__naked int bpf_fastcall_max_stack_fail(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + /* call to prandom blocks bpf_fastcall rewrite */ + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_nocsr.c deleted file mode 100644 index 666c736d196f..000000000000 --- a/tools/testing/selftests/bpf/progs/verifier_nocsr.c +++ /dev/null @@ -1,845 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include "../../../include/linux/filter.h" -#include "bpf_misc.h" - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 8") -__xlated("4: r5 = 5") -__xlated("5: w0 = ") -__xlated("6: r0 = &(void __percpu *)(r0)") -__xlated("7: r0 = *(u32 *)(r0 +0)") -__xlated("8: exit") -__success -__naked void simple(void) -{ - asm volatile ( - "r1 = 1;" - "r2 = 2;" - "r3 = 3;" - "r4 = 4;" - "r5 = 5;" - "*(u64 *)(r10 - 16) = r1;" - "*(u64 *)(r10 - 24) = r2;" - "*(u64 *)(r10 - 32) = r3;" - "*(u64 *)(r10 - 40) = r4;" - "*(u64 *)(r10 - 48) = r5;" - "call %[bpf_get_smp_processor_id];" - "r5 = *(u64 *)(r10 - 48);" - "r4 = *(u64 *)(r10 - 40);" - "r3 = *(u64 *)(r10 - 32);" - "r2 = *(u64 *)(r10 - 24);" - "r1 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -/* The logic for detecting and verifying nocsr pattern is the same for - * any arch, however x86 differs from arm64 or riscv64 in a way - * bpf_get_smp_processor_id is rewritten: - * - on x86 it is done by verifier - * - on arm64 and riscv64 it is done by jit - * - * Which leads to different xlated patterns for different archs: - * - on x86 the call is expanded as 3 instructions - * - on arm64 and riscv64 the call remains as is - * (but spills/fills are still removed) - * 
- * It is really desirable to check instruction indexes in the xlated - * patterns, so add this canary test to check that function rewrite by - * jit is correctly processed by nocsr logic, keep the rest of the - * tests as x86. - */ -SEC("raw_tp") -__arch_arm64 -__arch_riscv64 -__xlated("0: r1 = 1") -__xlated("1: call bpf_get_smp_processor_id") -__xlated("2: exit") -__success -__naked void canary_arm64_riscv64(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("3: exit") -__success -__naked void canary_zero_spills(void) -{ - asm volatile ( - "call %[bpf_get_smp_processor_id];" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 16") -__xlated("1: *(u64 *)(r10 -16) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r2 = *(u64 *)(r10 -16)") -__success -__naked void wrong_reg_in_pattern1(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r2 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -16) = r6") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r6 = *(u64 *)(r10 -16)") -__success -__naked void wrong_reg_in_pattern2(void) -{ - asm volatile ( - "r6 = 1;" - "*(u64 *)(r10 - 16) = r6;" - "call %[bpf_get_smp_processor_id];" - "r6 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -16) = r0") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r0 = *(u64 *)(r10 -16)") -__success -__naked void wrong_reg_in_pattern3(void) -{ - asm volatile ( - "r0 = 1;" - "*(u64 *)(r10 - 16) = r0;" - "call %[bpf_get_smp_processor_id];" - "r0 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("2: *(u64 *)(r2 -16) = r1") -__xlated("...") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("6: r1 = *(u64 *)(r10 -16)") -__success -__naked void wrong_base_in_pattern(void) -{ - asm volatile ( - "r1 = 1;" - "r2 = r10;" - "*(u64 *)(r2 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -16) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r2 = 1") -__success -__naked void wrong_insn_in_pattern(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r2 = 1;" - "r1 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("2: *(u64 *)(r10 -16) = r1") -__xlated("...") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("6: r1 = *(u64 *)(r10 -8)") -__success -__naked void wrong_off_in_pattern1(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 
*)(r10 - 8);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u32 *)(r10 -4) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u32 *)(r10 -4)") -__success -__naked void wrong_off_in_pattern2(void) -{ - asm volatile ( - "r1 = 1;" - "*(u32 *)(r10 - 4) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u32 *)(r10 - 4);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u32 *)(r10 -16) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u32 *)(r10 -16)") -__success -__naked void wrong_size_in_pattern(void) -{ - asm volatile ( - "r1 = 1;" - "*(u32 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u32 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("2: *(u32 *)(r10 -8) = r1") -__xlated("...") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("6: r1 = *(u32 *)(r10 -8)") -__success -__naked void partial_pattern(void) -{ - asm volatile ( - "r1 = 1;" - "r2 = 2;" - "*(u32 *)(r10 - 8) = r1;" - "*(u64 *)(r10 - 16) = r2;" - "call %[bpf_get_smp_processor_id];" - "r2 = *(u64 *)(r10 - 16);" - "r1 = *(u32 *)(r10 - 8);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("0: r1 = 1") -__xlated("1: r2 = 2") -/* not patched, spills for -8, -16 not removed */ -__xlated("2: *(u64 *)(r10 -8) = r1") -__xlated("3: *(u64 *)(r10 -16) = r2") -__xlated("...") -__xlated("5: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("7: r2 = *(u64 *)(r10 -16)") -__xlated("8: r1 = *(u64 *)(r10 -8)") -/* patched, spills for -24, -32 removed */ -__xlated("...") -__xlated("10: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("12: exit") -__success -__naked void min_stack_offset(void) -{ - asm volatile ( - "r1 = 1;" - "r2 = 2;" - /* this call won't be patched */ - "*(u64 *)(r10 - 8) = r1;" - "*(u64 *)(r10 - 16) = r2;" - "call %[bpf_get_smp_processor_id];" - "r2 = *(u64 *)(r10 - 16);" - "r1 = *(u64 *)(r10 - 8);" - /* this call would be patched */ - "*(u64 *)(r10 - 24) = r1;" - "*(u64 *)(r10 - 32) = r2;" - "call %[bpf_get_smp_processor_id];" - "r2 = *(u64 *)(r10 - 32);" - "r1 = *(u64 *)(r10 - 24);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__success -__naked void bad_fixed_read(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "r1 = *(u64 *)(r1 - 0);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__success -__naked void bad_fixed_write(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "*(u64 *)(r1 - 0) = r1;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("6: *(u64 *)(r10 -16) = 
r1") -__xlated("...") -__xlated("8: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("10: r1 = *(u64 *)(r10 -16)") -__success -__naked void bad_varying_read(void) -{ - asm volatile ( - "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ - "r6 &= 0x7;" /* r6 range [0..7] */ - "r6 += 0x2;" /* r6 range [2..9] */ - "r7 = 0;" - "r7 -= r6;" /* r7 range [-9..-2] */ - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "r1 = r10;" - "r1 += r7;" - "r1 = *(u8 *)(r1 - 0);" /* touches slot [-16..-9] where spills are stored */ - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("6: *(u64 *)(r10 -16) = r1") -__xlated("...") -__xlated("8: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("10: r1 = *(u64 *)(r10 -16)") -__success -__naked void bad_varying_write(void) -{ - asm volatile ( - "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ - "r6 &= 0x7;" /* r6 range [0..7] */ - "r6 += 0x2;" /* r6 range [2..9] */ - "r7 = 0;" - "r7 -= r6;" /* r7 range [-9..-2] */ - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "r1 = r10;" - "r1 += r7;" - "*(u8 *)(r1 - 0) = r7;" /* touches slot [-16..-9] where spills are stored */ - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__success -__naked void bad_write_in_subprog(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "call bad_write_in_subprog_aux;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -__used -__naked static void bad_write_in_subprog_aux(void) -{ - asm volatile ( - "r0 = 1;" - "*(u64 *)(r1 - 0) = r0;" /* invalidates nocsr contract for caller: */ - "exit;" /* caller stack at -8 used outside of the pattern */ - ::: __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__success -__naked void bad_helper_write(void) -{ - asm volatile ( - "r1 = 1;" - /* nocsr pattern with stack offset -8 */ - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "r2 = 1;" - "r3 = 42;" - /* read dst is fp[-8], thus nocsr rewrite not applied */ - "call %[bpf_probe_read_kernel];" - "exit;" - : - : __imm(bpf_get_smp_processor_id), - __imm(bpf_probe_read_kernel) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -/* main, not patched */ -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__xlated("...") -__xlated("9: call pc+1") -__xlated("...") -__xlated("10: exit") -/* subprogram, patched */ -__xlated("11: r1 = 1") -__xlated("...") -__xlated("13: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("15: exit") -__success -__naked void invalidate_one_subprog(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "r1 = *(u64 *)(r1 - 0);" - "call invalidate_one_subprog_aux;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) 
- : __clobber_all); -} - -__used -__naked static void invalidate_one_subprog_aux(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -/* main */ -__xlated("0: r1 = 1") -__xlated("...") -__xlated("2: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("4: call pc+1") -__xlated("5: exit") -/* subprogram */ -__xlated("6: r1 = 1") -__xlated("...") -__xlated("8: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("10: *(u64 *)(r10 -16) = r1") -__xlated("11: exit") -__success -__naked void subprogs_use_independent_offsets(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "call subprogs_use_independent_offsets_aux;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -__used -__naked static void subprogs_use_independent_offsets_aux(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 24) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 24);" - "*(u64 *)(r10 - 16) = r1;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 8") -__xlated("2: r0 = &(void __percpu *)(r0)") -__success -__naked void helper_call_does_not_prevent_nocsr(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_prandom_u32];" - "r1 = *(u64 *)(r10 - 8);" - "exit;" - : - : __imm(bpf_get_smp_processor_id), - __imm(bpf_get_prandom_u32) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 16") -/* may_goto counter at -16 */ -__xlated("0: *(u64 *)(r10 -16) =") -__xlated("1: r1 = 1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -/* may_goto expansion starts */ -__xlated("5: r11 = *(u64 *)(r10 -16)") -__xlated("6: if r11 == 0x0 goto pc+3") -__xlated("7: r11 -= 1") -__xlated("8: *(u64 *)(r10 -16) = r11") -/* may_goto expansion ends */ -__xlated("9: *(u64 *)(r10 -8) = r1") -__xlated("10: exit") -__success -__naked void may_goto_interaction(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - ".8byte %[may_goto];" - /* just touch some stack at -8 */ - "*(u64 *)(r10 - 8) = r1;" - "exit;" - : - : __imm(bpf_get_smp_processor_id), - __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) - : __clobber_all); -} - -__used -__naked static void dummy_loop_callback(void) -{ - asm volatile ( - "r0 = 0;" - "exit;" - ::: __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 32+0") -__xlated("2: r1 = 1") -__xlated("3: w0 =") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("5: r0 = *(u32 *)(r0 +0)") -/* bpf_loop params setup */ -__xlated("6: r2 =") -__xlated("7: r3 = 0") -__xlated("8: r4 = 0") -__xlated("...") -/* ... part of the inlined bpf_loop */ -__xlated("12: *(u64 *)(r10 -32) = r6") -__xlated("13: *(u64 *)(r10 -24) = r7") -__xlated("14: *(u64 *)(r10 -16) = r8") -__xlated("...") -__xlated("21: call pc+8") /* dummy_loop_callback */ -/* ... 
last insns of the bpf_loop_interaction1 */ -__xlated("...") -__xlated("28: r0 = 0") -__xlated("29: exit") -/* dummy_loop_callback */ -__xlated("30: r0 = 0") -__xlated("31: exit") -__success -__naked int bpf_loop_interaction1(void) -{ - asm volatile ( - "r1 = 1;" - /* nocsr stack region at -16, but could be removed */ - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "r2 = %[dummy_loop_callback];" - "r3 = 0;" - "r4 = 0;" - "call %[bpf_loop];" - "r0 = 0;" - "exit;" - : - : __imm_ptr(dummy_loop_callback), - __imm(bpf_get_smp_processor_id), - __imm(bpf_loop) - : __clobber_common - ); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 40+0") -/* call bpf_get_smp_processor_id */ -__xlated("2: r1 = 42") -__xlated("3: w0 =") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("5: r0 = *(u32 *)(r0 +0)") -/* call bpf_get_prandom_u32 */ -__xlated("6: *(u64 *)(r10 -16) = r1") -__xlated("7: call") -__xlated("8: r1 = *(u64 *)(r10 -16)") -__xlated("...") -/* ... part of the inlined bpf_loop */ -__xlated("15: *(u64 *)(r10 -40) = r6") -__xlated("16: *(u64 *)(r10 -32) = r7") -__xlated("17: *(u64 *)(r10 -24) = r8") -__success -__naked int bpf_loop_interaction2(void) -{ - asm volatile ( - "r1 = 42;" - /* nocsr stack region at -16, cannot be removed */ - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_prandom_u32];" - "r1 = *(u64 *)(r10 - 16);" - "r2 = %[dummy_loop_callback];" - "r3 = 0;" - "r4 = 0;" - "call %[bpf_loop];" - "r0 = 0;" - "exit;" - : - : __imm_ptr(dummy_loop_callback), - __imm(bpf_get_smp_processor_id), - __imm(bpf_get_prandom_u32), - __imm(bpf_loop) - : __clobber_common - ); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) -__msg("stack depth 512+0") -/* just to print xlated version when debugging */ -__xlated("r0 = &(void __percpu *)(r0)") -__success -/* cumulative_stack_depth() stack usage is MAX_BPF_STACK, - * called subprogram uses an additional slot for nocsr spill/fill, - * since nocsr spill/fill could be removed the program still fits - * in MAX_BPF_STACK and should be accepted. 
- */ -__naked int cumulative_stack_depth(void) -{ - asm volatile( - "r1 = 42;" - "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" - "call cumulative_stack_depth_subprog;" - "exit;" - : - : __imm_const(max_bpf_stack, MAX_BPF_STACK) - : __clobber_all - ); -} - -__used -__naked static void cumulative_stack_depth_subprog(void) -{ - asm volatile ( - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "exit;" - :: __imm(bpf_get_smp_processor_id) : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) -__msg("stack depth 512") -__xlated("0: r1 = 42") -__xlated("1: *(u64 *)(r10 -512) = r1") -__xlated("2: w0 = ") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("4: r0 = *(u32 *)(r0 +0)") -__xlated("5: exit") -__success -__naked int nocsr_max_stack_ok(void) -{ - asm volatile( - "r1 = 42;" - "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" - "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" - "exit;" - : - : __imm_const(max_bpf_stack, MAX_BPF_STACK), - __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), - __imm(bpf_get_smp_processor_id) - : __clobber_all - ); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) -__msg("stack depth 520") -__failure -__naked int nocsr_max_stack_fail(void) -{ - asm volatile( - "r1 = 42;" - "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" - "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" - /* call to prandom blocks nocsr rewrite */ - "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" - "call %[bpf_get_prandom_u32];" - "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" - "exit;" - : - : __imm_const(max_bpf_stack, MAX_BPF_STACK), - __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), - __imm(bpf_get_smp_processor_id), - __imm(bpf_get_prandom_u32) - : __clobber_all - ); -} - -char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From f406026fefa745ff9a3153507cc3b9a87371d954 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:11 -0700 Subject: selftests/bpf: by default use arch mask allowing all archs If test case does not specify architecture via __arch_* macro consider that it should be run for all architectures. Fixes: 7d743e4c759c ("selftests/bpf: __jited test tag to check disassembly after jit") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b229dd013355..2ca9b73e5a6b 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -543,7 +543,7 @@ static int parse_test_spec(struct test_loader *tester, } } - spec->arch_mask = arch_mask; + spec->arch_mask = arch_mask ?: -1; if (spec->mode_mask == 0) spec->mode_mask = PRIV; -- cgit v1.2.3 From 8c2e043daadad021fc501ac64cce131f48c3ca46 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:12 -0700 Subject: selftests/bpf: check if bpf_fastcall is recognized for kfuncs Use kfunc_bpf_cast_to_kern_ctx() and kfunc_bpf_rdonly_cast() to verify that bpf_fastcall pattern is recognized for kfunc calls. 
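For context before the diff: the contract being tested is the spill/fill pattern that bpf_fastcall removes around eligible calls. A minimal sketch, distilled from the existing tests in this file (it assumes the same test harness macros and is not part of the patch):

SEC("raw_tp")
__naked void fastcall_pattern_sketch(void)
{
	asm volatile (
	/* spill R1 just before the call... */
	"*(u64 *)(r10 - 8) = r1;"
	/* ...call something known to clobber only R0... */
	"call %[bpf_get_smp_processor_id];"
	/* ...and fill R1 just after, at the same offset and size;
	 * if the slot is used nowhere else, the verifier can remove
	 * both the spill and the fill */
	"r1 = *(u64 *)(r10 - 8);"
	"exit;"
	:
	: __imm(bpf_get_smp_processor_id)
	: __clobber_all);
}

The new tests verify that the same rewrite is recognized when the call target is a kfunc such as bpf_cast_to_kern_ctx(), which the verifier inlines into a single assignment.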
Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_bpf_fastcall.c | 55 ++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index e30ab9fe5096..9da97d2efcd9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -2,8 +2,11 @@ #include #include +#include #include "../../../include/linux/filter.h" #include "bpf_misc.h" +#include +#include "bpf_kfuncs.h" SEC("raw_tp") __arch_x86_64 @@ -842,4 +845,56 @@ __naked int bpf_fastcall_max_stack_fail(void) ); } +SEC("cgroup/getsockname_unix") +__xlated("0: r2 = 1") +/* bpf_cast_to_kern_ctx is replaced by a single assignment */ +__xlated("1: r0 = r1") +__xlated("2: r0 = r2") +__xlated("3: exit") +__success +__naked void kfunc_bpf_cast_to_kern_ctx(void) +{ + asm volatile ( + "r2 = 1;" + "*(u64 *)(r10 - 32) = r2;" + "call %[bpf_cast_to_kern_ctx];" + "r2 = *(u64 *)(r10 - 32);" + "r0 = r2;" + "exit;" + : + : __imm(bpf_cast_to_kern_ctx) + : __clobber_all); +} + +SEC("raw_tp") +__xlated("3: r3 = 1") +/* bpf_rdonly_cast is replaced by a single assignment */ +__xlated("4: r0 = r1") +__xlated("5: r0 = r3") +void kfunc_bpf_rdonly_cast(void) +{ + asm volatile ( + "r2 = %[btf_id];" + "r3 = 1;" + "*(u64 *)(r10 - 32) = r3;" + "call %[bpf_rdonly_cast];" + "r3 = *(u64 *)(r10 - 32);" + "r0 = r3;" + : + : __imm(bpf_rdonly_cast), + [btf_id]"r"(bpf_core_type_id_kernel(union bpf_attr)) + : __clobber_common); +} + +/* BTF FUNC records are not generated for kfuncs referenced + * from inline assembly. These records are necessary for + * libbpf to link the program. The function below is a hack + * to ensure that BTF FUNC records are generated. + */ +void kfunc_root(void) +{ + bpf_cast_to_kern_ctx(0); + bpf_rdonly_cast(0, 0); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a0d57c606188ebf106f26951558c3801627dd0a1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Aug 2024 16:26:26 -0700 Subject: perf annotate-data: Update debug messages In check_matching_type(), it'd be easier to display the typename in question if it's available. For example, check out the line starts with 'chk'. ----------------------------------------------------------- find data type for 0x10(reg0) at cpuacct_charge+0x13 CU for kernel/sched/build_utility.c (die:0x137ee0b) frame base: cfa=1 fbreg=7 scope: [3/3] (die:13d9632) bb: [c - 13] var [c] reg5 type='struct task_struct*' size=0x8 (die:0x1381230) mov [c] 0xdf8(reg5) -> reg0 type='struct css_set*' size=0x8 (die:0x1385c56) chk [13] reg0 offset=0x10 ok=1 kind=1 (struct css_set*) : Good! <<<--- here found by insn track: 0x10(reg0) type-offset=0x10 final result: type='struct css_set' size=0x250 (die:0x1385b0e) Another example: ----------------------------------------------------------- find data type for 0x8(reg0) at menu_select+0x279 CU for drivers/cpuidle/governors/menu.c (die:0x7b0fe79) frame base: cfa=1 fbreg=7 scope: [2/2] (die:7b11010) bb: [273 - 277] bb: [279 - 279] chk [279] reg0 offset=0x8 ok=0 kind=0 cfa : no type information scope: [1/2] (die:7b10cbc) bb: [0 - 64] ... 
mov [26a] imm=0xffffffff -> reg15 bb: [273 - 277] bb: [279 - 279] chk [279] reg0 offset=0x8 ok=1 kind=1 (long long unsigned int) : no/void pointer <<<--- here final result: no/void pointer Also change some places to print negative offsets properly. Before: ----------------------------------------------------------- find data type for 0xffffff40(reg6) at __tcp_transmit_skb+0x58 After: ----------------------------------------------------------- find data type for -0xc0(reg6) at __tcp_transmit_skb+0x58 Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821232628.353177-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 45 ++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 1e8328dde720..7d4b5aaa3b6a 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -960,9 +960,16 @@ static enum type_match_result check_matching_type(struct type_state *state, Dwarf_Word size; u32 insn_offset = dloc->ip - dloc->ms->sym->start; int reg = dloc->op->reg1; + int offset = dloc->op->offset; + const char *offset_sign = ""; - pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d ", - insn_offset, reg, dloc->op->offset, + if (offset < 0) { + offset = -offset; + offset_sign = "-"; + } + + pr_debug_dtp("chk [%x] reg%d offset=%s%#x ok=%d kind=%d ", + insn_offset, reg, offset_sign, offset, state->regs[reg].ok, state->regs[reg].kind); if (!state->regs[reg].ok) @@ -970,6 +977,12 @@ static enum type_match_result check_matching_type(struct type_state *state, if (state->regs[reg].kind == TSR_KIND_TYPE) { Dwarf_Die sized_type; + struct strbuf sb; + + strbuf_init(&sb, 32); + die_get_typename_from_type(&state->regs[reg].type, &sb); + pr_debug_dtp("(%s)", sb.buf); + strbuf_release(&sb); /* * Normal registers should hold a pointer (or array) to @@ -1119,7 +1132,6 @@ check_non_register: check_kernel: if (dso__kernel(map__dso(dloc->ms->map))) { u64 addr; - int offset; /* Direct this-cpu access like "%gs:0x34740" */ if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm && @@ -1271,6 +1283,13 @@ again: cu_die, type_die); if (ret == PERF_TMR_OK) { char buf[64]; + int offset = dloc->op->offset; + const char *offset_sign = ""; + + if (offset < 0) { + offset = -offset; + offset_sign = "-"; + } if (dloc->op->multi_regs) snprintf(buf, sizeof(buf), "reg%d, reg%d", @@ -1278,8 +1297,8 @@ again: else snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1); - pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n", - dloc->op->offset, buf, dloc->type_offset); + pr_debug_dtp("found by insn track: %s%#x(%s) type-offset=%#x\n", + offset_sign, offset, buf, dloc->type_offset); break; } @@ -1302,7 +1321,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) struct annotated_op_loc *loc = dloc->op; Dwarf_Die cu_die, var_die; Dwarf_Die *scopes = NULL; - int reg, offset; + int reg, offset = loc->offset; int ret = -1; int i, nr_scopes; int fbreg = -1; @@ -1312,6 +1331,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) u64 pc; char buf[64]; enum type_match_result result = PERF_TMR_UNKNOWN; + const char *offset_sign = ""; if (dloc->op->multi_regs) snprintf(buf, sizeof(buf), "reg%d, reg%d", dloc->op->reg1, dloc->op->reg2); @@ -1320,10 +1340,15 
@@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) else snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1); + if (offset < 0) { + offset = -offset; + offset_sign = "-"; + } + pr_debug_dtp("-----------------------------------------------------------\n"); - pr_debug_dtp("find data type for %#x(%s) at %s+%#"PRIx64"\n", - dloc->op->offset, buf, dloc->ms->sym->name, - dloc->ip - dloc->ms->sym->start); + pr_debug_dtp("find data type for %s%#x(%s) at %s+%#"PRIx64"\n", + offset_sign, offset, buf, + dloc->ms->sym->name, dloc->ip - dloc->ms->sym->start); /* * IP is a relative instruction address from the start of the map, as @@ -1453,8 +1478,8 @@ retry: } out: + pr_debug_dtp("final result: "); if (found) { - pr_debug_dtp("final type:"); pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; } else { -- cgit v1.2.3 From 895891dad7353d6058a8e5c499cf814fe7de9388 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Aug 2024 16:26:27 -0700 Subject: perf annotate-data: Update stack slot for the store When checking the match variable at the target instruction, it might not have any information if it's a first write to a stack slot. In this case it could spill a register value into the stack so the type info is in the source operand. But currently it's hard to get the operand from the checking function. Let's process the instruction and retry to get the type info from the stack if there's no information already. This is an example of __tcp_transmit_skb(). The instructions are <__tcp_transmit_skb>: 0: nopl 0x0(%rax, %rax, 1) 5: push %rbp 6: mov %rsp, %rbp 9: push %r15 b: push %r14 d: push %r13 f: push %r12 11: push %rbx 12: sub $0x98, %rsp 19: mov %r8d, -0xa8(%rbp) ... It cannot find any variable at -0xa8(%rbp) at this point. ----------------------------------------------------------- find data type for -0xa8(reg6) at __tcp_transmit_skb+0x19 CU for net/ipv4/tcp_output.c (die:0x817f543) frame base: cfa=0 fbreg=6 scope: [1/1] (die:81aac3e) bb: [0 - 19] var [0] -0x98(stack) type='struct tcp_out_options' size=0x28 (die:0x81af3df) var [5] reg8 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg2 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg1 type='int' size=0x4 (die:0x818059e) var [5] reg4 type='struct sk_buff*' size=0x8 (die:0x8181360) var [5] reg5 type='struct sock*' size=0x8 (die:0x8181a0c) chk [19] reg6 offset=-0xa8 ok=0 kind=0 fbreg : no type information no type information And it was able to find the type after processing the 'mov' instruction. ----------------------------------------------------------- find data type for -0xa8(reg6) at __tcp_transmit_skb+0x19 CU for net/ipv4/tcp_output.c (die:0x817f543) frame base: cfa=0 fbreg=6 scope: [1/1] (die:81aac3e) bb: [0 - 19] var [0] -0x98(stack) type='struct tcp_out_options' size=0x28 (die:0x81af3df) var [5] reg8 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg2 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg1 type='int' size=0x4 (die:0x818059e) var [5] reg4 type='struct sk_buff*' size=0x8 (die:0x8181360) var [5] reg5 type='struct sock*' size=0x8 (die:0x8181a0c) chk [19] reg6 offset=-0xa8 ok=0 kind=0 fbreg : retry <<<--- here mov [19] reg8 -> -0xa8(stack) type='unsigned int' size=0x4 (die:0x8180ed6) chk [19] reg6 offset=-0xa8 ok=0 kind=0 fbreg : Good! 
found by insn track: -0xa8(reg6) type-offset=0 final result: type='unsigned int' size=0x4 (die:0x8180ed6) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821232628.353177-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 7d4b5aaa3b6a..5d23c30b3a7b 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -955,19 +955,22 @@ static void setup_stack_canary(struct data_loc_info *dloc) static enum type_match_result check_matching_type(struct type_state *state, struct data_loc_info *dloc, Dwarf_Die *cu_die, + struct disasm_line *dl, Dwarf_Die *type_die) { Dwarf_Word size; - u32 insn_offset = dloc->ip - dloc->ms->sym->start; + u32 insn_offset = dl->al.offset; int reg = dloc->op->reg1; int offset = dloc->op->offset; const char *offset_sign = ""; + bool retry = true; if (offset < 0) { offset = -offset; offset_sign = "-"; } +again: pr_debug_dtp("chk [%x] reg%d offset=%s%#x ok=%d kind=%d ", insn_offset, reg, offset_sign, offset, state->regs[reg].ok, state->regs[reg].kind); @@ -1079,8 +1082,17 @@ check_non_register: pr_debug_dtp("fbreg"); stack = find_stack_state(state, dloc->type_offset); - if (stack == NULL) + if (stack == NULL) { + if (retry) { + pr_debug_dtp(" : retry\n"); + retry = false; + + /* update type info if it's the first store to the stack */ + update_insn_state(state, dloc, cu_die, dl); + goto again; + } return PERF_TMR_NO_TYPE; + } if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); @@ -1111,8 +1123,17 @@ check_non_register: return PERF_TMR_NO_TYPE; stack = find_stack_state(state, dloc->type_offset - fboff); - if (stack == NULL) + if (stack == NULL) { + if (retry) { + pr_debug_dtp(" : retry\n"); + retry = false; + + /* update type info if it's the first store to the stack */ + update_insn_state(state, dloc, cu_die, dl); + goto again; + } return PERF_TMR_NO_TYPE; + } if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); @@ -1202,7 +1223,7 @@ static enum type_match_result find_data_type_insn(struct data_loc_info *dloc, if (this_ip == dloc->ip) { ret = check_matching_type(&state, dloc, - cu_die, type_die); + cu_die, dl, type_die); pr_debug_dtp(" : %s\n", match_result_str(ret)); goto out; } -- cgit v1.2.3 From 1cfd01eb602d73b92df2ffc24196cd0a3dc3efb2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Aug 2024 16:26:28 -0700 Subject: perf annotate-data: Copy back variable types after move In some cases, compilers don't set the location expression in DWARF precisely. For instance, a compiler may assign a variable to a register after copying it from a different register. Then it should use the new register for the new type, but it still uses the old register. This makes it hard to track the type information properly. This is an example I found in __tcp_transmit_skb(). The first argument (sk) of this function is a pointer to sock and there's a variable (tp) for tcp_sock. static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { ... struct tcp_sock *tp; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); ...
So it basically calls tcp_sk(sk) to get the tcp_sock pointer from sk. But it turned out to be the same value because tcp_sock embeds sock as the first member. The sk is located in reg5 (RDI) and tp is in reg3 (RBX). The offset of tcp_wstamp_ns is 0x748 and tcp_clock_cache is 0x750. So you need to use RBX (reg3) to access the fields in the tcp_sock. But the code used RDI (reg5) as it has the same value. $ pahole --hex -C tcp_sock vmlinux | grep -e 748 -e 750 u64 tcp_wstamp_ns; /* 0x748 0x8 */ u64 tcp_clock_cache; /* 0x750 0x8 */ And this is the disassembly of the relevant part of the function. <__tcp_transmit_skb>: ... 44: mov %rdi, %rbx 47: mov 0x748(%rdi), %rsi 4e: mov 0x750(%rdi), %rax 55: cmp %rax, %rsi Because the compiler put the debug info on RBX, it only knows RDI is a pointer to sock, and accessing those two fields resulted in an error because the offset is beyond the type size. ----------------------------------------------------------- find data type for 0x748(reg5) at __tcp_transmit_skb+0x63 CU for net/ipv4/tcp_output.c (die:0x817f543) frame base: cfa=0 fbreg=6 scope: [1/1] (die:81aac3e) bb: [0 - 30] var [0] -0x98(stack) type='struct tcp_out_options' size=0x28 (die:0x81af3df) var [5] reg8 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg2 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg1 type='int' size=0x4 (die:0x818059e) var [5] reg4 type='struct sk_buff*' size=0x8 (die:0x8181360) var [5] reg5 type='struct sock*' size=0x8 (die:0x8181a0c) <<<--- the first argument ('sk' at %RDI) mov [19] reg8 -> -0xa8(stack) type='unsigned int' size=0x4 (die:0x8180ed6) mov [20] stack canary -> reg0 mov [29] reg0 -> -0x30(stack) stack canary bb: [36 - 3e] mov [36] reg4 -> reg15 type='struct sk_buff*' size=0x8 (die:0x8181360) bb: [44 - 63] mov [44] reg5 -> reg3 type='struct sock*' size=0x8 (die:0x8181a0c) <<<--- calling tcp_sk() var [47] reg3 type='struct tcp_sock*' size=0x8 (die:0x819eead) <<<--- new variable ('tp' at %RBX) var [4e] reg4 type='unsigned long long' size=0x8 (die:0x8180edd) mov [58] reg4 -> -0xc0(stack) type='unsigned long long' size=0x8 (die:0x8180edd) chk [63] reg5 offset=0x748 ok=1 kind=1 (struct sock*) : offset bigger than size <<<--- access with old variable final result: offset bigger than size While it's a fault in the compiler, we could work around this issue by using the type of the new variable when it's copied directly. So I've added a copied_from field in the register state to track those direct register-to-register copies. After that, when the new register gets a new type while the old register still has the old type, it'll update (copy back) the type of the old register. For example, if we can update the type of reg5 at __tcp_transmit_skb+0x47, we can find the target type of the instruction at 0x63 like below: ----------------------------------------------------------- find data type for 0x748(reg5) at __tcp_transmit_skb+0x63 ... bb: [44 - 63] mov [44] reg5 -> reg3 type='struct sock*' size=0x8 (die:0x8181a0c) var [47] reg3 type='struct tcp_sock*' size=0x8 (die:0x819eead) var [47] copyback reg5 type='struct tcp_sock*' size=0x8 (die:0x819eead) <<<--- here mov [47] 0x748(reg5) -> reg4 type='unsigned long long' size=0x8 (die:0x8180edd) mov [4e] 0x750(reg5) -> reg0 type='unsigned long long' size=0x8 (die:0x8180edd) mov [58] reg4 -> -0xc0(stack) type='unsigned long long' size=0x8 (die:0x8180edd) chk [63] reg5 offset=0x748 ok=1 kind=1 (struct tcp_sock*) : Good!
<<<--- new type found by insn track: 0x748(reg5) type-offset=0x748 final result: type='struct tcp_sock' size=0xa98 (die:0x819eeb2) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821232628.353177-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 8 ++++++++ tools/perf/util/annotate-data.c | 31 +++++++++++++++++++++++++++++ tools/perf/util/annotate-data.h | 1 + 3 files changed, 40 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 15dfc2988e24..5caf5a17f03d 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -267,6 +267,7 @@ static void update_insn_state_x86(struct type_state *state, return; tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; if (src->imm) imm_value = src->offset; @@ -326,6 +327,8 @@ static void update_insn_state_x86(struct type_state *state, return; tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; + if (dso__kernel(map__dso(dloc->ms->map)) && src->segment == INSN_SEG_X86_GS && src->imm) { u64 ip = dloc->ms->sym->start + dl->al.offset; @@ -386,6 +389,10 @@ static void update_insn_state_x86(struct type_state *state, tsr->imm_value = state->regs[src->reg1].imm_value; tsr->ok = true; + /* To copy back the variable type later (hopefully) */ + if (tsr->kind == TSR_KIND_TYPE) + tsr->copied_from = src->reg1; + pr_debug_dtp("mov [%x] reg%d -> reg%d", insn_offset, src->reg1, dst->reg1); pr_debug_type_name(&tsr->type, tsr->kind); @@ -398,6 +405,7 @@ static void update_insn_state_x86(struct type_state *state, return; tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; retry: /* Check stack variables with offset */ diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 5d23c30b3a7b..a0ea4e07e570 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -774,6 +774,11 @@ ok: return true; } +static bool die_is_same(Dwarf_Die *die_a, Dwarf_Die *die_b) +{ + return (die_a->cu == die_b->cu) && (die_a->addr == die_b->addr); +} + /** * update_var_state - Update type state using given variables * @state: type state table @@ -825,6 +830,7 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } else if (has_reg_type(state, var->reg) && var->offset == 0) { struct type_state_reg *reg; + Dwarf_Die orig_type; reg = &state->regs[var->reg]; @@ -832,6 +838,8 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo !is_better_type(®->type, &mem_die)) continue; + orig_type = reg->type; + reg->type = mem_die; reg->kind = TSR_KIND_TYPE; reg->ok = true; @@ -839,6 +847,29 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo pr_debug_dtp("var [%"PRIx64"] reg%d", insn_offset, var->reg); pr_debug_type_name(&mem_die, TSR_KIND_TYPE); + + /* + * If this register is directly copied from another and it gets a + * better type, also update the type of the source register. This + * is usually the case of container_of() macro with offset of 0. 
+ */ + if (has_reg_type(state, reg->copied_from)) { + struct type_state_reg *copy_reg; + + copy_reg = &state->regs[reg->copied_from]; + + /* TODO: check if type is compatible or embedded */ + if (!copy_reg->ok || (copy_reg->kind != TSR_KIND_TYPE) || + !die_is_same(&copy_reg->type, &orig_type) || + !is_better_type(&copy_reg->type, &mem_die)) + continue; + + copy_reg->type = mem_die; + + pr_debug_dtp("var [%"PRIx64"] copyback reg%d", + insn_offset, reg->copied_from); + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); + } } } } diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 37a1a3b68e0b..8ac0fd94a0ba 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -176,6 +176,7 @@ struct type_state_reg { bool ok; bool caller_saved; u8 kind; + u8 copied_from; }; /* Type information in a stack location, dynamically allocated */ -- cgit v1.2.3 From b81162302001f41157f6e93654aaccc30e817e2a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 22 Aug 2024 14:13:49 -0300 Subject: perf python: Allow checking for the existence of warning options in clang We'll need to check if a warning option introduced in clang 19 is available on the clang version being used, so cover the error message emitted when testing for a -W option. Tested-by: Sedat Dilek Cc: Ian Rogers Cc: Ingo Molnar Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/CA+icZUVtHn8X1Tb_Y__c-WswsO0K8U9uy3r2MzKXwTA5THtL7w@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 142e9d447ce7..26c0f2614fe9 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -17,7 +17,7 @@ src_feature_tests = getenv('srctree') + '/tools/build/feature' def clang_has_option(option): cc_output = Popen([cc, cc_options + option, path.join(src_feature_tests, "test-hello.c") ], stderr=PIPE).stderr.readlines() - return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o))] == [ ] + return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o) or (b"unknown warning option" in o))] == [ ] if cc_is_clang: from sysconfig import get_config_vars -- cgit v1.2.3 From 00dc514612fe98cfa117193b9df28f15e7c9db9c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 22 Aug 2024 14:13:49 -0300 Subject: perf python: Disable -Wno-cast-function-type-mismatch if present on clang The -Wcast-function-type-mismatch option was introduced in clang 19 and is enabled by default. Since we use -Werror, and the python bindings do casts that are valid but trip this warning, disable it if present.
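As an illustration only (the type and function names below are invented, not code from the perf bindings), this is the kind of function-pointer cast involved — legal C, and safe when the dispatcher casts back to the matching type before calling, yet flagged by clang 19:

/* Placeholder three-argument function-pointer type, standing in for
 * a generic method-table type (e.g. PyCFunction-style dispatch in
 * the CPython API): */
typedef void *(*cfunc3)(void *self, void *args, void *kwargs);

/* A handler written against a two-argument calling convention: */
static void *handler(void *self, void *args)
{
	(void)self;
	(void)args;
	return 0;
}

int main(void)
{
	/* clang 19 emits -Wcast-function-type-mismatch here because
	 * the prototypes differ, even though the cast itself is
	 * well-formed C: */
	cfunc3 entry = (cfunc3)handler;

	(void)entry;
	return 0;
}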
Closes: https://lore.kernel.org/all/CA+icZUXoJ6BS3GMhJHV3aZWyb5Cz2haFneX0C5pUMUUhG-UVKQ@mail.gmail.com Reported-by: Sedat Dilek Tested-by: Sedat Dilek Cc: Ian Rogers Cc: Ingo Molnar Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: stable@vger.kernel.org # To allow building with the upcoming clang 19 Link: https://lore.kernel.org/lkml/CA+icZUVtHn8X1Tb_Y__c-WswsO0K8U9uy3r2MzKXwTA5THtL7w@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/setup.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 26c0f2614fe9..649550e9b7aa 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -63,6 +63,8 @@ cflags = getenv('CFLAGS', '').split() cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter', '-Wno-redundant-decls' ] if cc_is_clang: cflags += ["-Wno-unused-command-line-argument" ] + if clang_has_option("-Wno-cast-function-type-mismatch"): + cflags += ["-Wno-cast-function-type-mismatch" ] else: cflags += ['-Wno-cast-function-type' ] -- cgit v1.2.3 From 7559a7a84ef83a2dd86caf623430b8d834843cec Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 28 Jul 2024 19:46:12 +0800 Subject: selftests/bpf: Add testcase for updating attached freplace prog to prog_array map Add a selftest to confirm that the issue, where updating a prog_array map with an attached freplace prog got -EINVAL, has been fixed. cd tools/testing/selftests/bpf; ./test_progs -t tailcalls 328/25 tailcalls/tailcall_freplace:OK 328 tailcalls:OK Summary: 1/25 PASSED, 0 SKIPPED, 0 FAILED Acked-by: Yonghong Song Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240728114612.48486-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/tailcalls.c | 65 +++++++++++++++++++++- .../selftests/bpf/progs/tailcall_freplace.c | 23 ++++++++ tools/testing/selftests/bpf/progs/tc_bpf2bpf.c | 22 ++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_freplace.c create mode 100644 tools/testing/selftests/bpf/progs/tc_bpf2bpf.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index e01fabb8cc41..21c5a37846ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -5,7 +5,8 @@ #include "tailcall_poke.skel.h" #include "tailcall_bpf2bpf_hierarchy2.skel.h" #include "tailcall_bpf2bpf_hierarchy3.skel.h" - +#include "tailcall_freplace.skel.h" +#include "tc_bpf2bpf.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -1495,6 +1496,66 @@ static void test_tailcall_bpf2bpf_hierarchy_3(void) { RUN_TESTS(tailcall_bpf2bpf_hierarchy3); } +/* test_tailcall_freplace checks that the attached freplace prog is OK to + * update the prog_array map.
+ */ +static void test_tailcall_freplace(void) +{ + struct tailcall_freplace *freplace_skel = NULL; + struct bpf_link *freplace_link = NULL; + struct bpf_program *freplace_prog; + struct tc_bpf2bpf *tc_skel = NULL; + int prog_fd, map_fd; + char buff[128] = {}; + int err, key; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = buff, + .data_size_in = sizeof(buff), + .repeat = 1, + ); + + freplace_skel = tailcall_freplace__open(); + if (!ASSERT_OK_PTR(freplace_skel, "tailcall_freplace__open")) + return; + + tc_skel = tc_bpf2bpf__open_and_load(); + if (!ASSERT_OK_PTR(tc_skel, "tc_bpf2bpf__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(tc_skel->progs.entry_tc); + freplace_prog = freplace_skel->progs.entry_freplace; + err = bpf_program__set_attach_target(freplace_prog, prog_fd, "subprog"); + if (!ASSERT_OK(err, "set_attach_target")) + goto out; + + err = tailcall_freplace__load(freplace_skel); + if (!ASSERT_OK(err, "tailcall_freplace__load")) + goto out; + + freplace_link = bpf_program__attach_freplace(freplace_prog, prog_fd, + "subprog"); + if (!ASSERT_OK_PTR(freplace_link, "attach_freplace")) + goto out; + + map_fd = bpf_map__fd(freplace_skel->maps.jmp_table); + prog_fd = bpf_program__fd(freplace_prog); + key = 0; + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + prog_fd = bpf_program__fd(tc_skel->progs.entry_tc); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 34, "test_run retval"); + +out: + bpf_link__destroy(freplace_link); + tc_bpf2bpf__destroy(tc_skel); + tailcall_freplace__destroy(freplace_skel); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1543,4 +1604,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_hierarchy_fentry_entry(); test_tailcall_bpf2bpf_hierarchy_2(); test_tailcall_bpf2bpf_hierarchy_3(); + if (test__start_subtest("tailcall_freplace")) + test_tailcall_freplace(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_freplace.c b/tools/testing/selftests/bpf/progs/tailcall_freplace.c new file mode 100644 index 000000000000..6713b809df44 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_freplace.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; + +SEC("freplace") +int entry_freplace(struct __sk_buff *skb) +{ + count++; + bpf_tail_call_static(skb, &jmp_table, 0); + return count; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c new file mode 100644 index 000000000000..8a0632c37839 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +__noinline +int subprog(struct __sk_buff *skb) +{ + int ret = 1; + + __sink(ret); + return ret; +} + +SEC("tc") +int entry_tc(struct __sk_buff *skb) +{ + return subprog(skb); +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 1820b84f3c61fbe5d9cf82a09cb9733a23cd463c Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Wed, 21 Aug 2024 22:49:01 +0530 Subject: selftests: net: Create veth pair for testing in networkless kernel Check if the netdev list is empty and create veth pair to be used for feature on/off 
testing. Remove the veth pair after testing is complete. Signed-off-by: Abhinav Jain Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240821171903.118324-2-jain.abhinav177@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netdevice.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index e3afcb424710..999d72b6670c 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -129,6 +129,7 @@ kci_netdev_ethtool() kci_netdev_ethtool_test 74 'dump' "ethtool -d $netdev" kci_netdev_ethtool_test 94 'stats' "ethtool -S $netdev" + return 0 } @@ -196,10 +197,24 @@ if [ ! -e "$TMP_LIST_NETDEV" ];then fi ip link show |grep '^[0-9]' | grep -oE '[[:space:]].*eth[0-9]*:|[[:space:]].*enp[0-9]s[0-9]:' | cut -d\ -f2 | cut -d: -f1> "$TMP_LIST_NETDEV" + +if [ ! -s "$TMP_LIST_NETDEV" ]; then + echo "No valid network device found, creating veth pair" + ip link add veth0 type veth peer name veth1 + echo "veth0" > "$TMP_LIST_NETDEV" + veth_created=1 +fi + while read netdev do kci_test_netdev "$netdev" done < "$TMP_LIST_NETDEV" +#clean up veth interface pair if it was created +if [ "$veth_created" ]; then + ip link delete veth0 + echo "Removed veth pair" +fi + rm "$TMP_LIST_NETDEV" exit 0 -- cgit v1.2.3 From 6ce7bdbc0d4bde7578727e95e7d138e364512b33 Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Wed, 21 Aug 2024 22:49:02 +0530 Subject: selftests: net: Add on/off checks for non-fixed features of interface Implement on/off testing for all non-fixed features via while loop. Signed-off-by: Abhinav Jain Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240821171903.118324-3-jain.abhinav177@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netdevice.sh | 35 +++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index 999d72b6670c..f7752e1ebe22 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -124,7 +124,40 @@ kci_netdev_ethtool() return 1 fi echo "PASS: $netdev: ethtool list features" - #TODO for each non fixed features, try to turn them on/off + + while read -r FEATURE VALUE FIXED; do + [ "$FEATURE" != "Features" ] || continue # Skip "Features" + [ "$FIXED" != "[fixed]" ] || continue # Skip fixed features + feature="${FEATURE%:*}" + + ethtool --offload "$netdev" "$feature" off + if [ $? -eq 0 ]; then + echo "PASS: $netdev: Turned off feature: $feature" + else + echo "FAIL: $netdev: Failed to turn off feature:" \ + "$feature" + fi + + ethtool --offload "$netdev" "$feature" on + if [ $? -eq 0 ]; then + echo "PASS: $netdev: Turned on feature: $feature" + else + echo "FAIL: $netdev: Failed to turn on feature:" \ + "$feature" + fi + + #restore the feature to its initial state + ethtool --offload "$netdev" "$feature" "$VALUE" + if [ $? 
-eq 0 ]; then + echo "PASS: $netdev: Restore feature $feature" \ + "to initial state $VALUE" + else + echo "FAIL: $netdev: Failed to restore feature" \ + "$feature to initial state $VALUE" + fi + + done < "$TMP_ETHTOOL_FEATURES" + rm "$TMP_ETHTOOL_FEATURES" kci_netdev_ethtool_test 74 'dump' "ethtool -d $netdev" -- cgit v1.2.3 From 8402a158028ffe030f3a02ec57b8c94cb94b137c Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Wed, 21 Aug 2024 22:49:03 +0530 Subject: selftests: net: Use XFAIL for operations not supported by the driver Check if veth pair was created and if yes, xfail on setting IP address logging an informational message. Use XFAIL instead of SKIP for unsupported ethtool APIs. Signed-off-by: Abhinav Jain Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240821171903.118324-4-jain.abhinav177@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netdevice.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index f7752e1ebe22..438f7b2acc5f 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -67,8 +67,12 @@ kci_net_setup() return $ksft_skip fi - # TODO what ipaddr to set ? DHCP ? - echo "SKIP: $netdev: set IP address" + if [ "$veth_created" ]; then + echo "XFAIL: $netdev: set IP address unsupported for veth*" + else + # TODO what ipaddr to set ? DHCP ? + echo "SKIP: $netdev: set IP address" + fi return $ksft_skip } @@ -86,7 +90,7 @@ kci_netdev_ethtool_test() ret=$? if [ $ret -ne 0 ];then if [ $ret -eq "$1" ];then - echo "SKIP: $netdev: ethtool $2 not supported" + echo "XFAIL: $netdev: ethtool $2 not supported" return $ksft_skip else echo "FAIL: $netdev: ethtool $2" -- cgit v1.2.3 From 5a98c2e5399b125231ebb4594fee5fddfb7db9fd Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Fri, 23 Aug 2024 09:45:59 +0200 Subject: ASoC: dapm-graph: remove the "ROOT" cluster Widgets not belonging to any component are currently represented inside a cluster labeled "ROOT". This is not a correct representation of the actual structure, as these widgets are not necessarily related to each other as the ones inside actual components are. Improve the graphical representation by not adding a cluster around these widgets. Now a dot cluster represents a card component faithfully. This will be particularly important with the upcoming improvements which will visualize the component bias_level. 
Signed-off-by: Luca Ceresoli Link: https://patch.msgid.link/20240823-dapm-graph-v1-1-989a47308c4c@bootlin.com Signed-off-by: Mark Brown --- tools/sound/dapm-graph | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph index 57d78f6df041..205783d124d3 100755 --- a/tools/sound/dapm-graph +++ b/tools/sound/dapm-graph @@ -150,29 +150,32 @@ process_dapm_widget() # # $1 = temporary work dir # $2 = component directory -# $3 = forced component name (extracted for path if empty) +# $3 = "ROOT" for the root card directory, empty otherwise process_dapm_component() { local tmp_dir="${1}" local c_dir="${2}" local c_name="${3}" + local is_component=0 local dot_file="${tmp_dir}/main.dot" local links_file="${tmp_dir}/links.dot" if [ -z "${c_name}" ]; then + is_component=1 + # Extract directory name into component name: # "./cs42l51.0-004a/dapm" -> "cs42l51.0-004a" c_name="$(basename $(dirname "${c_dir}"))" + + echo "" >> "${dot_file}" + echo " subgraph \"${c_name}\" {" >> "${dot_file}" + echo " cluster = true" >> "${dot_file}" + echo " label = \"${c_name}\"" >> "${dot_file}" + echo " color=dodgerblue" >> "${dot_file}" fi dbg_echo " * Component: ${c_name}" - echo "" >> "${dot_file}" - echo " subgraph \"${c_name}\" {" >> "${dot_file}" - echo " cluster = true" >> "${dot_file}" - echo " label = \"${c_name}\"" >> "${dot_file}" - echo " color=dodgerblue" >> "${dot_file}" - # Create empty file to ensure it will exist in all cases >"${links_file}" @@ -181,7 +184,9 @@ process_dapm_component() process_dapm_widget "${tmp_dir}" "${c_name}" "${w_file}" done - echo " }" >> "${dot_file}" + if [ ${is_component} = 1 ]; then + echo " }" >> "${dot_file}" + fi cat "${links_file}" >> "${dot_file}" } -- cgit v1.2.3 From 64a1e3ddab1ebaa590101b0d7d7fa5d3144da1e8 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Fri, 23 Aug 2024 09:46:00 +0200 Subject: ASoC: dapm-graph: visualize component On/Off bias level Read the bias_level debugfs files (ignored so far) and visualize the On/Off state of each component using different graphic attributes in the generated graph. 
Signed-off-by: Luca Ceresoli Link: https://patch.msgid.link/20240823-dapm-graph-v1-2-989a47308c4c@bootlin.com Signed-off-by: Mark Brown --- tools/sound/dapm-graph | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph index 205783d124d3..4e90883912d0 100755 --- a/tools/sound/dapm-graph +++ b/tools/sound/dapm-graph @@ -8,6 +8,8 @@ set -eu +STYLE_COMPONENT_ON="color=dodgerblue;style=bold" +STYLE_COMPONENT_OFF="color=gray40;style=filled;fillcolor=gray90" STYLE_NODE_ON="shape=box,style=bold,color=green4" STYLE_NODE_OFF="shape=box,style=filled,color=gray30,fillcolor=gray95" @@ -159,6 +161,7 @@ process_dapm_component() local is_component=0 local dot_file="${tmp_dir}/main.dot" local links_file="${tmp_dir}/links.dot" + local c_attribs="" if [ -z "${c_name}" ]; then is_component=1 @@ -166,16 +169,28 @@ process_dapm_component() # Extract directory name into component name: # "./cs42l51.0-004a/dapm" -> "cs42l51.0-004a" c_name="$(basename $(dirname "${c_dir}"))" + fi + + dbg_echo " * Component: ${c_name}" + + if [ ${is_component} = 1 ]; then + if [ -f "${c_dir}/bias_level" ]; then + c_onoff=$(sed -n -e 1p "${c_dir}/bias_level" | awk '{print $1}') + dbg_echo " - bias_level: ${c_onoff}" + if [ "$c_onoff" = "On" ]; then + c_attribs="${STYLE_COMPONENT_ON}" + elif [ "$c_onoff" = "Off" ]; then + c_attribs="${STYLE_COMPONENT_OFF}" + fi + fi echo "" >> "${dot_file}" echo " subgraph \"${c_name}\" {" >> "${dot_file}" echo " cluster = true" >> "${dot_file}" echo " label = \"${c_name}\"" >> "${dot_file}" - echo " color=dodgerblue" >> "${dot_file}" + echo " ${c_attribs}" >> "${dot_file}" fi - dbg_echo " * Component: ${c_name}" - # Create empty file to ensure it will exist in all cases >"${links_file}" -- cgit v1.2.3 From a14b278a47dd4b263799214c5ae0da6506ed7692 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Fri, 23 Aug 2024 09:46:01 +0200 Subject: ASoC: dapm-graph: show path name for non-static routes Many routes are just static, not modifiable at runtime. Show the route name for all the other routes as an edge label in the generated graph. Signed-off-by: Luca Ceresoli Link: https://patch.msgid.link/20240823-dapm-graph-v1-3-989a47308c4c@bootlin.com Signed-off-by: Mark Brown --- tools/sound/dapm-graph | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph index 4e90883912d0..f14bdfedee8f 100755 --- a/tools/sound/dapm-graph +++ b/tools/sound/dapm-graph @@ -134,11 +134,17 @@ process_dapm_widget() # Collect any links. 
We could use "in" links or "out" links, # let's use "in" links if echo "${line}" | grep -q '^in '; then + local w_route=$(echo "$line" | awk -F\" '{print $2}') local w_src=$(echo "$line" | awk -F\" '{print $6 "_" $4}' | sed 's/^(null)_/ROOT_/') dbg_echo " - Input route from: ${w_src}" - echo " \"${w_src}\" -> \"$w_tag\"" >> "${links_file}" + dbg_echo " - Route: ${w_route}" + local w_edge_attrs="" + if [ "${w_route}" != "static" ]; then + w_edge_attrs=" [label=\"${w_route}\"]" + fi + echo " \"${w_src}\" -> \"$w_tag\"${w_edge_attrs}" >> "${links_file}" fi done @@ -220,7 +226,7 @@ process_dapm_tree() echo "digraph G {" > "${dot_file}" echo " fontname=\"sans-serif\"" >> "${dot_file}" echo " node [fontname=\"sans-serif\"]" >> "${dot_file}" - + echo " edge [fontname=\"sans-serif\"]" >> "${dot_file}" # Process root directory (no component) process_dapm_component "${tmp_dir}" "${dapm_dir}/dapm" "ROOT" -- cgit v1.2.3 From 5225b6562b9a7dc808d5a1e465aaf5e2ebb220cd Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 21 Aug 2024 17:44:01 +0100 Subject: kselftest/arm64: signal: fix/refactor SVE vector length enumeration Currently a number of SVE/SME related tests have almost identical functions to enumerate all supported vector lengths. However over time the copy&pasted code has diverged, allowing some bugs to creep in: - fake_sigreturn_sme_change_vl reports a failure, not a SKIP if only one vector length is supported (but the SVE version is fine) - fake_sigreturn_sme_change_vl tries to set the SVE vector length, not the SME one (but the other SME tests are fine) - za_no_regs keeps iterating forever if only one vector length is supported (but za_regs is correct) Since those bugs seem to be mostly copy&paste ones, let's consolidate the enumeration loop into one shared function, and just call that from each test. That should fix the above bugs, and prevent similar issues from happening again. 
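One way to exercise all of the converted testcases at once is via kselftest (a sketch; treating ARM64_SUBTARGETS as a subtarget filter is an assumption about the arm64 selftest Makefile):

# Build and run only the arm64 signal tests, which now share sve_helpers.c
make -C tools/testing/selftests TARGETS=arm64 ARM64_SUBTARGETS=signal run_tests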
Fixes: 4963aeb35a9e ("kselftest/arm64: signal: Add SME signal handling tests") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240821164401.3598545-1-andre.przywara@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/signal/Makefile | 2 +- tools/testing/selftests/arm64/signal/sve_helpers.c | 56 ++++++++++++++++++++++ tools/testing/selftests/arm64/signal/sve_helpers.h | 21 ++++++++ .../testcases/fake_sigreturn_sme_change_vl.c | 32 ++++--------- .../testcases/fake_sigreturn_sve_change_vl.c | 30 +++--------- .../selftests/arm64/signal/testcases/ssve_regs.c | 36 ++++---------- .../arm64/signal/testcases/ssve_za_regs.c | 36 ++++---------- .../selftests/arm64/signal/testcases/sve_regs.c | 32 ++++--------- .../selftests/arm64/signal/testcases/za_no_regs.c | 32 ++++--------- .../selftests/arm64/signal/testcases/za_regs.c | 36 ++++---------- 10 files changed, 132 insertions(+), 181 deletions(-) create mode 100644 tools/testing/selftests/arm64/signal/sve_helpers.c create mode 100644 tools/testing/selftests/arm64/signal/sve_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile index 8f5febaf1a9a..edb3613513b8 100644 --- a/tools/testing/selftests/arm64/signal/Makefile +++ b/tools/testing/selftests/arm64/signal/Makefile @@ -23,7 +23,7 @@ $(TEST_GEN_PROGS): $(PROGS) # Common test-unit targets to build common-layout test-cases executables # Needs secondary expansion to properly include the testcase c-file in pre-reqs COMMON_SOURCES := test_signals.c test_signals_utils.c testcases/testcases.c \ - signals.S + signals.S sve_helpers.c COMMON_HEADERS := test_signals.h test_signals_utils.h testcases/testcases.h .SECONDEXPANSION: diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.c b/tools/testing/selftests/arm64/signal/sve_helpers.c new file mode 100644 index 000000000000..0acc121af306 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/sve_helpers.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Limited + * + * Common helper functions for SVE and SME functionality. + */ + +#include +#include +#include +#include + +unsigned int vls[SVE_VQ_MAX]; +unsigned int nvls; + +int sve_fill_vls(bool use_sme, int min_vls) +{ + int vq, vl; + int pr_set_vl = use_sme ? PR_SME_SET_VL : PR_SVE_SET_VL; + int len_mask = use_sme ? PR_SME_VL_LEN_MASK : PR_SVE_VL_LEN_MASK; + + /* + * Enumerate up to SVE_VQ_MAX vector lengths + */ + for (vq = SVE_VQ_MAX; vq > 0; --vq) { + vl = prctl(pr_set_vl, vq * 16); + if (vl == -1) + return KSFT_FAIL; + + vl &= len_mask; + + /* + * Unlike SVE, SME does not require the minimum vector length + * to be implemented, or the VLs to be consecutive, so any call + * to the prctl might return the single implemented VL, which + * might be larger than 16. So to avoid this loop never + * terminating, bail out here when we find a higher VL than + * we asked for. + * See the ARM ARM, DDI 0487K.a, B1.4.2: I_QQRNR and I_NWYBP. 
+ */ + if (vq < sve_vq_from_vl(vl)) + break; + + /* Skip missing VLs */ + vq = sve_vq_from_vl(vl); + + vls[nvls++] = vl; + } + + if (nvls < min_vls) { + fprintf(stderr, "Only %d VL supported\n", nvls); + return KSFT_SKIP; + } + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.h b/tools/testing/selftests/arm64/signal/sve_helpers.h new file mode 100644 index 000000000000..50948ce471cc --- /dev/null +++ b/tools/testing/selftests/arm64/signal/sve_helpers.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 ARM Limited + * + * Common helper functions for SVE and SME functionality. + */ + +#ifndef __SVE_HELPERS_H__ +#define __SVE_HELPERS_H__ + +#include + +#define VLS_USE_SVE false +#define VLS_USE_SME true + +extern unsigned int vls[]; +extern unsigned int nvls; + +int sve_fill_vls(bool use_sme, int min_vls); + +#endif diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c index ebd5815b54bb..cb8c051b5c8f 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c @@ -6,44 +6,28 @@ * handler, this is not supported and is expected to segfault. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" struct fake_sigframe sf; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 2); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SME_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least two VLs */ - if (nvls < 2) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static int fake_sigreturn_ssve_change_vl(struct tdescr *td, diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c index e2a452190511..e1ccf8f85a70 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c @@ -12,40 +12,22 @@ #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" struct fake_sigframe sf; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sve_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SVE, 2); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SVE_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least two VLs */ - if (nvls < 2) { - fprintf(stderr, "Only %d VL supported\n", nvls); + if (res == KSFT_SKIP) td->result = KSFT_SKIP; - return false; - } - return true; + return false; } static int fake_sigreturn_sve_change_vl(struct tdescr *td, diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c 
b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c index 3d37daafcff5..6dbe48cf8b09 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c @@ -6,51 +6,31 @@ * set up as expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 64]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_ssve_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c index 9dc5f128bbc0..5557e116e973 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c @@ -6,51 +6,31 @@ * signal frames is set up as expected when enabled simultaneously. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c index 8b16eabbb769..8143eb1c58c1 100644 --- a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c @@ -6,47 +6,31 @@ * expected. 
*/ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 64]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sve_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SVE, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SVE_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); + if (!res) + return true; - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_sve_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c index 4d6f94b6178f..ce26e9c2fa5e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c @@ -6,47 +6,31 @@ * expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SME_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); + if (!res) + return true; - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c index 174ad6656696..b9e13f27f1f9 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c @@ -6,51 +6,31 @@ * expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SME_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? 
*/ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_za_regs(void) -- cgit v1.2.3 From ec1f77f6557b46639fa47c6980ef9d38995c1e05 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:42 -0700 Subject: selftests/bpf: test_loader.c:get_current_arch() should not return 0 At the moment, when test_loader.c:get_current_arch() can't determine the arch, it returns 0. The arch check in run_subtest() looks as follows: if ((get_current_arch() & spec->arch_mask) == 0) { test__skip(); return; } Which means that all test_loader based tests would be skipped if arch could not be determined. get_current_arch() recognizes x86_64, arm64 and riscv64. Which means that CI skips test_loader tests for s390. Fix this by making sure that get_current_arch() always returns non-zero value. In combination with default spec->arch_mask == -1 this should cover all possibilities. Fixes: f406026fefa7 ("selftests/bpf: by default use arch mask allowing all archs") Fixes: 7d743e4c759c ("selftests/bpf: __jited test tag to check disassembly after jit") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 2ca9b73e5a6b..4223cffc090e 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -336,9 +336,10 @@ static const char *skip_dynamic_pfx(const char *s, const char *pfx) } enum arch { - ARCH_X86_64 = 0x1, - ARCH_ARM64 = 0x2, - ARCH_RISCV64 = 0x4, + ARCH_UNKNOWN = 0x1, + ARCH_X86_64 = 0x2, + ARCH_ARM64 = 0x4, + ARCH_RISCV64 = 0x8, }; static int get_current_arch(void) @@ -350,7 +351,7 @@ static int get_current_arch(void) #elif defined(__riscv) && __riscv_xlen == 64 return ARCH_RISCV64; #endif - return 0; + return ARCH_UNKNOWN; } /* Uses btf_decl_tag attributes to describe the expected test -- cgit v1.2.3 From c52a1e6eb74ffe4f217e9b53ed75440a45b08c4c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:43 -0700 Subject: selftests/bpf: match both retq/rethunk in verifier_tailcall_jit Depending on kernel parameters, x86 jit generates either retq or jump to rethunk for 'exit' instruction. The difference could be seen when kernel is booted with and without mitigations=off parameter. Relax the verifier_tailcall_jit test case to match both variants. 
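The two variants can be inspected on a live system with bpftool (a sketch; the program id 42 is hypothetical):

# With return thunks active the JITed program ends in a "jmp" to the
# rethunk; when booted with mitigations=off it ends in a plain "retq".
bpftool prog dump jited id 42 | tail -n 3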
Fixes: e5bdd6a8be78 ("selftests/bpf: validate jit behaviour for tail calls") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c index 06d327cf1e1f..8d60c634a114 100644 --- a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -59,7 +59,7 @@ __jited(" movq -0x10(%rbp), %rax") __jited(" callq 0x{{.*}}") /* call to sub() */ __jited(" xorl %eax, %eax") __jited(" leave") -__jited(" retq") +__jited(" {{(retq|jmp 0x)}}") /* return or jump to rethunk */ __jited("...") /* subprogram entry for sub(), regular function prologue */ __jited(" endbr64") @@ -89,7 +89,7 @@ __jited(" popq %rax") __jited(" popq %rax") __jited(" jmp {{.*}}") /* jump to tail call tgt */ __jited("L0: leave") -__jited(" retq") +__jited(" {{(retq|jmp 0x)}}") /* return or jump to rethunk */ SEC("tc") __naked int main(void) { -- cgit v1.2.3 From 21a56fc503faae7589167fcd2771ab6dce36ab39 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:44 -0700 Subject: selftests/bpf: #define LOCAL_LABEL_LEN for jit_disasm_helpers.c Extract local label length as a #define directive and elaborate why 'i % MAX_LOCAL_LABELS' expression is needed for local labels array initialization. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/jit_disasm_helpers.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c index 1b0f1fd267c0..febd6b12e372 100644 --- a/tools/testing/selftests/bpf/jit_disasm_helpers.c +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -16,6 +16,11 @@ */ #define MAX_LOCAL_LABELS 32 +/* Local labels are encoded as 'L42', this requires 4 bytes of storage: + * 3 characters + zero byte + */ +#define LOCAL_LABEL_LEN 4 + static bool llvm_initialized; struct local_labels { @@ -23,7 +28,7 @@ struct local_labels { __u32 prog_len; __u32 cnt; __u32 pcs[MAX_LOCAL_LABELS]; - char names[MAX_LOCAL_LABELS][4]; + char names[MAX_LOCAL_LABELS][LOCAL_LABEL_LEN]; }; static const char *lookup_symbol(void *data, uint64_t ref_value, uint64_t *ref_type, @@ -118,8 +123,14 @@ static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) } qsort(labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); for (i = 0; i < labels.cnt; ++i) - /* use (i % 100) to avoid format truncation warning */ - snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % 100); + /* gcc is unable to infer upper bound for labels.cnt and assumes + * it to be U32_MAX. U32_MAX takes 10 decimal digits. + * snprintf below prints into labels.names[*], + * which has space only for two digits and a letter. + * To avoid truncation warning use (i % MAX_LOCAL_LABELS), + * which informs gcc about printed value upper bound. 
+ */ + snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % MAX_LOCAL_LABELS); /* now print with labels */ labels.print_phase = true; -- cgit v1.2.3 From 4e9e07603ecdfd0816838132f354239771c51edd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Aug 2024 16:03:19 -0700 Subject: selftests/bpf: make use of PROCMAP_QUERY ioctl if available Instead of parsing text-based /proc//maps file, try to use PROCMAP_QUERY ioctl() to simplify and speed up data fetching. This logic is used to do uprobe file offset calculation, so any bugs in this logic would manifest as failing uprobe BPF selftests. This also serves as a simple demonstration of one of the intended uses. Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240806230319.869734-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 3 + tools/testing/selftests/bpf/test_progs.h | 2 + tools/testing/selftests/bpf/trace_helpers.c | 104 ++++++++++++++++++++++++---- 3 files changed, 94 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index f45b06791444..83f390a31681 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -35,6 +35,8 @@ __weak void backtrace_symbols_fd(void *const *buffer, int size, int fd) dprintf(fd, "\n"); } +int env_verbosity = 0; + static bool verbose(void) { return env.verbosity > VERBOSE_NONE; @@ -973,6 +975,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } } + env_verbosity = env->verbosity; if (verbose()) { if (setenv("SELFTESTS_VERBOSE", "1", 1) == -1) { diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 3ad131de14c6..7767d9a825ae 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -96,6 +96,8 @@ struct test_state { FILE *stdout_saved; }; +extern int env_verbosity; + struct test_env { struct test_selector test_selector; struct test_selector subtest_selector; diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 465d196c7165..1bfd881c0e07 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include "trace_helpers.h" #include @@ -244,29 +246,91 @@ out: return err; } +#ifdef PROCMAP_QUERY +int env_verbosity __weak = 0; + +int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) +{ + char path_buf[PATH_MAX], build_id_buf[20]; + struct procmap_query q; + int err; + + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_flags = query_flags; + q.query_addr = (__u64)addr; + q.vma_name_addr = (__u64)path_buf; + q.vma_name_size = sizeof(path_buf); + q.build_id_addr = (__u64)build_id_buf; + q.build_id_size = sizeof(build_id_buf); + + err = ioctl(fd, PROCMAP_QUERY, &q); + if (err < 0) { + err = -errno; + if (err == -ENOTTY) + return -EOPNOTSUPP; /* ioctl() not implemented yet */ + if (err == -ENOENT) + return -ESRCH; /* vma not found */ + return err; + } + + if (env_verbosity >= 1) { + printf("VMA FOUND (addr %08lx): %08lx-%08lx %c%c%c%c %08lx %02x:%02x %ld %s (build ID: %s, %d bytes)\n", + (long)addr, (long)q.vma_start, (long)q.vma_end, + (q.vma_flags & PROCMAP_QUERY_VMA_READABLE) ? 
'r' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_WRITABLE) ? 'w' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_EXECUTABLE) ? 'x' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_SHARED) ? 's' : 'p', + (long)q.vma_offset, q.dev_major, q.dev_minor, (long)q.inode, + q.vma_name_size ? path_buf : "", + q.build_id_size ? "YES" : "NO", + q.build_id_size); + } + + *start = q.vma_start; + *offset = q.vma_offset; + *flags = q.vma_flags; + return 0; +} +#else +int procmap_query(int fd, const void *addr, size_t *start, size_t *offset, int *flags) +{ + return -EOPNOTSUPP; +} +#endif + ssize_t get_uprobe_offset(const void *addr) { - size_t start, end, base; - char buf[256]; - bool found = false; + size_t start, base, end; FILE *f; + char buf[256]; + int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; - while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { - if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { - found = true; - break; + /* requested executable VMA only */ + err = procmap_query(fileno(f), addr, PROCMAP_QUERY_VMA_EXECUTABLE, &start, &base, &flags); + if (err == -EOPNOTSUPP) { + bool found = false; + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { + if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { + found = true; + break; + } + } + if (!found) { + fclose(f); + return -ESRCH; } + } else if (err) { + fclose(f); + return err; } - fclose(f); - if (!found) - return -ESRCH; - #if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 #define OP_RT_RA_MASK 0xffff0000UL @@ -307,15 +371,25 @@ ssize_t get_rel_offset(uintptr_t addr) size_t start, end, offset; char buf[256]; FILE *f; + int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; - while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { - if (addr >= start && addr < end) { - fclose(f); - return (size_t)addr - start + offset; + err = procmap_query(fileno(f), (const void *)addr, 0, &start, &offset, &flags); + if (err == 0) { + fclose(f); + return (size_t)addr - start + offset; + } else if (err != -EOPNOTSUPP) { + fclose(f); + return err; + } else if (err) { + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { + if (addr >= start && addr < end) { + fclose(f); + return (size_t)addr - start + offset; + } } } -- cgit v1.2.3 From f727b13dbea16c5e117e263aa8aea59d632d5660 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 5 Aug 2024 21:29:35 -0700 Subject: selftests/bpf: add multi-uprobe benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multi-uprobe and multi-uretprobe benchmarks to the bench tool. Multi- and classic uprobes/uretprobes have different low-level triggering code paths, so it's sometimes important to be able to benchmark both flavors of uprobes/uretprobes. Sample examples from my dev machine below. Single-threaded performance almost doesn't differ, but with more parallel CPUs triggering the same uprobe/uretprobe the difference grows. This might be due to [0], but given the code is slightly different, there could be other sources of slowdown. Note, all these numbers will change due to ongoing work to improve uprobe/uretprobe scalability (e.g., [1]), but having a benchmark like this is useful for measurements and debugging nevertheless.
\#!/bin/bash set -eufo pipefail for p in 1 8 16 32; do for i in uprobe-nop uretprobe-nop uprobe-multi-nop uretprobe-multi-nop; do summary=$(sudo ./bench -w1 -d3 -p$p -a trig-$i | tail -n1) total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-) percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1) printf "%-21s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu" done echo done uprobe-nop ( 1 cpus): 1.020 ± 0.005M/s ( 1.020M/s/cpu) uretprobe-nop ( 1 cpus): 0.515 ± 0.009M/s ( 0.515M/s/cpu) uprobe-multi-nop ( 1 cpus): 1.036 ± 0.004M/s ( 1.036M/s/cpu) uretprobe-multi-nop ( 1 cpus): 0.512 ± 0.005M/s ( 0.512M/s/cpu) uprobe-nop ( 8 cpus): 3.481 ± 0.030M/s ( 0.435M/s/cpu) uretprobe-nop ( 8 cpus): 2.222 ± 0.008M/s ( 0.278M/s/cpu) uprobe-multi-nop ( 8 cpus): 3.769 ± 0.094M/s ( 0.471M/s/cpu) uretprobe-multi-nop ( 8 cpus): 2.482 ± 0.007M/s ( 0.310M/s/cpu) uprobe-nop (16 cpus): 2.968 ± 0.011M/s ( 0.185M/s/cpu) uretprobe-nop (16 cpus): 1.870 ± 0.002M/s ( 0.117M/s/cpu) uprobe-multi-nop (16 cpus): 3.541 ± 0.037M/s ( 0.221M/s/cpu) uretprobe-multi-nop (16 cpus): 2.123 ± 0.026M/s ( 0.133M/s/cpu) uprobe-nop (32 cpus): 2.524 ± 0.026M/s ( 0.079M/s/cpu) uretprobe-nop (32 cpus): 1.572 ± 0.003M/s ( 0.049M/s/cpu) uprobe-multi-nop (32 cpus): 2.717 ± 0.003M/s ( 0.085M/s/cpu) uretprobe-multi-nop (32 cpus): 1.687 ± 0.007M/s ( 0.053M/s/cpu) [0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/ [1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240806042935.3867862-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 12 ++++ tools/testing/selftests/bpf/benchs/bench_trigger.c | 81 ++++++++++++++++++---- tools/testing/selftests/bpf/progs/trigger_bench.c | 7 ++ 3 files changed, 85 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 90dc3aca32bd..1bd403a5ef7b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -520,6 +520,12 @@ extern const struct bench bench_trig_uprobe_push; extern const struct bench bench_trig_uretprobe_push; extern const struct bench bench_trig_uprobe_ret; extern const struct bench bench_trig_uretprobe_ret; +extern const struct bench bench_trig_uprobe_multi_nop; +extern const struct bench bench_trig_uretprobe_multi_nop; +extern const struct bench bench_trig_uprobe_multi_push; +extern const struct bench bench_trig_uretprobe_multi_push; +extern const struct bench bench_trig_uprobe_multi_ret; +extern const struct bench bench_trig_uretprobe_multi_ret; extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; @@ -574,6 +580,12 @@ static const struct bench *benchs[] = { &bench_trig_uretprobe_push, &bench_trig_uprobe_ret, &bench_trig_uretprobe_ret, + &bench_trig_uprobe_multi_nop, + &bench_trig_uretprobe_multi_nop, + &bench_trig_uprobe_multi_push, + &bench_trig_uretprobe_multi_push, + &bench_trig_uprobe_multi_ret, + &bench_trig_uretprobe_multi_ret, /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 4b05539f167d..a220545a3238 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -332,7 +332,7 @@ static void 
*uprobe_producer_ret(void *input) return NULL; } -static void usetup(bool use_retprobe, void *target_addr) +static void usetup(bool use_retprobe, bool use_multi, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; @@ -346,7 +346,10 @@ static void usetup(bool use_retprobe, void *target_addr) exit(1); } - bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + if (use_multi) + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe_multi, true); + else + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); err = trigger_bench__load(ctx.skel); if (err) { @@ -355,16 +358,28 @@ static void usetup(bool use_retprobe, void *target_addr) } uprobe_offset = get_uprobe_offset(target_addr); - link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, - use_retprobe, - -1 /* all PIDs */, - "/proc/self/exe", - uprobe_offset); + if (use_multi) { + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .retprobe = use_retprobe, + .cnt = 1, + .offsets = &uprobe_offset, + ); + link = bpf_program__attach_uprobe_multi( + ctx.skel->progs.bench_trigger_uprobe_multi, + -1 /* all PIDs */, "/proc/self/exe", NULL, &opts); + ctx.skel->links.bench_trigger_uprobe_multi = link; + } else { + link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, + use_retprobe, + -1 /* all PIDs */, + "/proc/self/exe", + uprobe_offset); + ctx.skel->links.bench_trigger_uprobe = link; + } if (!link) { - fprintf(stderr, "failed to attach uprobe!\n"); + fprintf(stderr, "failed to attach %s!\n", use_multi ? "multi-uprobe" : "uprobe"); exit(1); } - ctx.skel->links.bench_trigger_uprobe = link; } static void usermode_count_setup(void) @@ -374,32 +389,62 @@ static void usermode_count_setup(void) static void uprobe_nop_setup(void) { - usetup(false, &uprobe_target_nop); + usetup(false, false /* !use_multi */, &uprobe_target_nop); } static void uretprobe_nop_setup(void) { - usetup(true, &uprobe_target_nop); + usetup(true, false /* !use_multi */, &uprobe_target_nop); } static void uprobe_push_setup(void) { - usetup(false, &uprobe_target_push); + usetup(false, false /* !use_multi */, &uprobe_target_push); } static void uretprobe_push_setup(void) { - usetup(true, &uprobe_target_push); + usetup(true, false /* !use_multi */, &uprobe_target_push); } static void uprobe_ret_setup(void) { - usetup(false, &uprobe_target_ret); + usetup(false, false /* !use_multi */, &uprobe_target_ret); } static void uretprobe_ret_setup(void) { - usetup(true, &uprobe_target_ret); + usetup(true, false /* !use_multi */, &uprobe_target_ret); +} + +static void uprobe_multi_nop_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_nop); +} + +static void uretprobe_multi_nop_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_nop); +} + +static void uprobe_multi_push_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_push); +} + +static void uretprobe_multi_push_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_push); +} + +static void uprobe_multi_ret_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_ret); +} + +static void uretprobe_multi_ret_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_ret); } const struct bench bench_trig_syscall_count = { @@ -454,3 +499,9 @@ BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); 
+BENCH_TRIG_USERMODE(uprobe_multi_nop, nop, "uprobe-multi-nop"); +BENCH_TRIG_USERMODE(uprobe_multi_push, push, "uprobe-multi-push"); +BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret"); +BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop"); +BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push"); +BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret"); diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2619ed193c65..044a6d78923e 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -32,6 +32,13 @@ int bench_trigger_uprobe(void *ctx) return 0; } +SEC("?uprobe.multi") +int bench_trigger_uprobe_multi(void *ctx) +{ + inc_counter(); + return 0; +} + const volatile int batch_iters = 0; SEC("?raw_tp") -- cgit v1.2.3 From 91c96842ab1e9159c8129ab5ddfeb7dd97bf840e Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 13 Aug 2024 21:24:24 +0000 Subject: selftests/bpf: Test bpf_kptr_xchg stashing into local kptr Test stashing both referenced kptr and local kptr into local kptrs. Then, test unstashing them. Acked-by: Martin KaFai Lau Acked-by: Hou Tao Signed-off-by: Dave Marchevsky Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-6-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/local_kptr_stash.c | 30 ++++++++++++++++++++-- .../selftests/bpf/progs/task_kfunc_success.c | 26 ++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/local_kptr_stash.c b/tools/testing/selftests/bpf/progs/local_kptr_stash.c index 75043ffc5dad..b092a72b2c9d 100644 --- a/tools/testing/selftests/bpf/progs/local_kptr_stash.c +++ b/tools/testing/selftests/bpf/progs/local_kptr_stash.c @@ -8,9 +8,12 @@ #include "../bpf_experimental.h" #include "../bpf_testmod/bpf_testmod_kfunc.h" +struct plain_local; + struct node_data { long key; long data; + struct plain_local __kptr * stashed_in_local_kptr; struct bpf_rb_node node; }; @@ -85,6 +88,7 @@ static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b) static int create_and_stash(int idx, int val) { + struct plain_local *inner_local_kptr; struct map_value *mapval; struct node_data *res; @@ -92,11 +96,25 @@ static int create_and_stash(int idx, int val) if (!mapval) return 1; + inner_local_kptr = bpf_obj_new(typeof(*inner_local_kptr)); + if (!inner_local_kptr) + return 2; + res = bpf_obj_new(typeof(*res)); - if (!res) - return 1; + if (!res) { + bpf_obj_drop(inner_local_kptr); + return 3; + } res->key = val; + inner_local_kptr = bpf_kptr_xchg(&res->stashed_in_local_kptr, inner_local_kptr); + if (inner_local_kptr) { + /* Should never happen, we just obj_new'd res */ + bpf_obj_drop(inner_local_kptr); + bpf_obj_drop(res); + return 4; + } + res = bpf_kptr_xchg(&mapval->node, res); if (res) bpf_obj_drop(res); @@ -169,6 +187,7 @@ long stash_local_with_root(void *ctx) SEC("tc") long unstash_rb_node(void *ctx) { + struct plain_local *inner_local_kptr = NULL; struct map_value *mapval; struct node_data *res; long retval; @@ -180,6 +199,13 @@ long unstash_rb_node(void *ctx) res = bpf_kptr_xchg(&mapval->node, NULL); if (res) { + inner_local_kptr = bpf_kptr_xchg(&res->stashed_in_local_kptr, inner_local_kptr); + if (!inner_local_kptr) { + bpf_obj_drop(res); + return 1; + } + bpf_obj_drop(inner_local_kptr); + retval = res->key; 
bpf_obj_drop(res); return retval; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 70df695312dc..3138bb689b0b 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -5,6 +5,7 @@ #include #include +#include "../bpf_experimental.h" #include "task_kfunc_common.h" char _license[] SEC("license") = "GPL"; @@ -143,7 +144,7 @@ SEC("tp_btf/task_newtask") int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) { struct task_struct *kptr; - struct __tasks_kfunc_map_value *v; + struct __tasks_kfunc_map_value *v, *local; long status; if (!is_test_kfunc_task()) @@ -167,6 +168,29 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } + local = bpf_obj_new(typeof(*local)); + if (!local) { + err = 4; + bpf_task_release(kptr); + return 0; + } + + kptr = bpf_kptr_xchg(&local->task, kptr); + if (kptr) { + err = 5; + bpf_obj_drop(local); + bpf_task_release(kptr); + return 0; + } + + kptr = bpf_kptr_xchg(&local->task, NULL); + if (!kptr) { + err = 6; + bpf_obj_drop(local); + return 0; + } + + bpf_obj_drop(local); bpf_task_release(kptr); return 0; -- cgit v1.2.3 From 5772c3458bb8d17d763e0f411e1bae1bf4eda88d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 12:44:09 -0700 Subject: selftests/bpf: use simply-expanded variables for libpcap flags Save pkg-config output for libpcap as simply-expanded variables. For an obscure reason, a 'shell' call in the recursively expanded LDLIBS/CFLAGS variables makes compilation of *.test.o files non-parallel when make is executed with the -j option. While at it, reuse the 'pkg-config --cflags' call to define the -DTRAFFIC_MONITOR=1 option; its exit status is the same as for 'pkg-config --exists'. Fixes: f52403b6bfea ("selftests/bpf: Add traffic monitor functions.") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823194409.774815-1-eddyz87@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ec7d425c4022..c120617b64ad 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -48,9 +48,10 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic \ LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread -LDLIBS += $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) -CFLAGS += $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null) -CFLAGS += $(shell $(PKG_CONFIG) --exists libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") +PCAP_CFLAGS := $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") +PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) +LDLIBS += $(PCAP_LIBS) +CFLAGS += $(PCAP_CFLAGS) # The following tests perform type punning and they may break strict # aliasing rules, which are exploited by both GCC and clang by default -- cgit v1.2.3 From 65ab5ac4df012388481d0414fcac1d5ac1721fb3 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Fri, 23 Aug 2024 12:51:00 -0700 Subject: bpf: Add bpf_copy_from_user_str kfunc This adds a kfunc wrapper around strncpy_from_user, which can be called from sleepable BPF programs.
This matches the non-sleepable 'bpf_probe_read_user_str' helper except it includes an additional 'flags' param, which allows consumers to clear the entire destination buffer on success or failure. Signed-off-by: Jordan Rome Link: https://lore.kernel.org/r/20240823195101.3621028-1-linux@jordanrome.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/helpers.c | 42 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 3 files changed, 60 insertions(+) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2629457d72d..c3a5728db115 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7513,4 +7513,13 @@ struct bpf_iter_num { __u64 __opaque[1]; } __attribute__((aligned(8))); +/* + * Flags to control BPF kfunc behaviour. + * - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective + * helper documentation for details.) + */ +enum bpf_kfunc_flags { + BPF_F_PAD_ZEROS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f12f075b70c5..7e7761c537f8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2962,6 +2962,47 @@ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) bpf_mem_free(&bpf_global_ma, kit->bits); } +/** + * bpf_copy_from_user_str() - Copy a string from an unsafe user address + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address, in user space. + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL-terminated string from userspace to BPF space. If user string is + * too long this will still ensure zero termination in the dst buffer unless + * buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and + * memset all of @dst on failure. + */ +__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(!dst__sz)) + return 0; + + ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst, 0, dst__sz); + + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst + ret, 0, dst__sz - ret); + else + ((char *)dst)[ret] = '\0'; + + return ret + 1; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3047,6 +3088,7 @@ BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..f329ee44627a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7512,4 +7512,13 @@ struct bpf_iter_num { __u64 __opaque[1]; } __attribute__((aligned(8))); +/* + * Flags to control BPF kfunc behaviour. + * - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective + * helper documentation for details.) 
+ */ +enum bpf_kfunc_flags { + BPF_F_PAD_ZEROS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From ddc3d98807dca05ad10f4c76d13019c428302719 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Fri, 23 Aug 2024 12:51:01 -0700 Subject: selftests/bpf: Add tests for bpf_copy_from_user_str kfunc. This adds tests for both the happy path and the error path. Signed-off-by: Jordan Rome Link: https://lore.kernel.org/r/20240823195101.3621028-2-linux@jordanrome.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/attach_probe.c | 8 ++- .../selftests/bpf/prog_tests/read_vsyscall.c | 1 + tools/testing/selftests/bpf/progs/read_vsyscall.c | 9 ++- .../selftests/bpf/progs/test_attach_probe.c | 64 +++++++++++++++++++++- 4 files changed, 75 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 7175af39134f..329c7862b52d 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -283,9 +283,11 @@ static void test_uprobe_sleepable(struct test_attach_probe *skel) trigger_func3(); ASSERT_EQ(skel->bss->uprobe_byname3_sleepable_res, 9, "check_uprobe_byname3_sleepable_res"); - ASSERT_EQ(skel->bss->uprobe_byname3_res, 10, "check_uprobe_byname3_res"); - ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 11, "check_uretprobe_byname3_sleepable_res"); - ASSERT_EQ(skel->bss->uretprobe_byname3_res, 12, "check_uretprobe_byname3_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_str_sleepable_res, 10, "check_uprobe_byname3_str_sleepable_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_res, 11, "check_uprobe_byname3_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 12, "check_uretprobe_byname3_sleepable_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_str_sleepable_res, 13, "check_uretprobe_byname3_str_sleepable_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_res, 14, "check_uretprobe_byname3_res"); } void test_attach_probe(void) diff --git a/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c index 3405923fe4e6..c7b9ba8b1d06 100644 --- a/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c +++ b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c @@ -23,6 +23,7 @@ struct read_ret_desc { { .name = "probe_read_user_str", .ret = -EFAULT }, { .name = "copy_from_user", .ret = -EFAULT }, { .name = "copy_from_user_task", .ret = -EFAULT }, + { .name = "copy_from_user_str", .ret = -EFAULT }, }; void test_read_vsyscall(void) diff --git a/tools/testing/selftests/bpf/progs/read_vsyscall.c b/tools/testing/selftests/bpf/progs/read_vsyscall.c index 986f96687ae1..39ebef430059 100644 --- a/tools/testing/selftests/bpf/progs/read_vsyscall.c +++ b/tools/testing/selftests/bpf/progs/read_vsyscall.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2024. 
Huawei Technologies Co., Ltd */ +#include "vmlinux.h" #include #include @@ -7,10 +8,15 @@ int target_pid = 0; void *user_ptr = 0; -int read_ret[8]; +int read_ret[9]; char _license[] SEC("license") = "GPL"; +/* + * This is the only kfunc, the others are helpers + */ +int bpf_copy_from_user_str(void *dst, u32, const void *, u64) __weak __ksym; + SEC("fentry/" SYS_PREFIX "sys_nanosleep") int do_probe_read(void *ctx) { @@ -40,6 +46,7 @@ int do_copy_from_user(void *ctx) read_ret[6] = bpf_copy_from_user(buf, sizeof(buf), user_ptr); read_ret[7] = bpf_copy_from_user_task(buf, sizeof(buf), user_ptr, bpf_get_current_task_btf(), 0); + read_ret[8] = bpf_copy_from_user_str((char *)buf, sizeof(buf), user_ptr, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index 68466a6ad18c..fb79e6cab932 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -5,8 +5,10 @@ #include #include #include +#include #include "bpf_misc.h" +u32 dynamic_sz = 1; int kprobe2_res = 0; int kretprobe2_res = 0; int uprobe_byname_res = 0; @@ -14,11 +16,15 @@ int uretprobe_byname_res = 0; int uprobe_byname2_res = 0; int uretprobe_byname2_res = 0; int uprobe_byname3_sleepable_res = 0; +int uprobe_byname3_str_sleepable_res = 0; int uprobe_byname3_res = 0; int uretprobe_byname3_sleepable_res = 0; +int uretprobe_byname3_str_sleepable_res = 0; int uretprobe_byname3_res = 0; void *user_ptr = 0; +int bpf_copy_from_user_str(void *dst, u32, const void *, u64) __weak __ksym; + SEC("ksyscall/nanosleep") int BPF_KSYSCALL(handle_kprobe_auto, struct __kernel_timespec *req, struct __kernel_timespec *rem) { @@ -87,11 +93,61 @@ static __always_inline bool verify_sleepable_user_copy(void) return bpf_strncmp(data, sizeof(data), "test_data") == 0; } +static __always_inline bool verify_sleepable_user_copy_str(void) +{ + int ret; + char data_long[20]; + char data_long_pad[20]; + char data_long_err[20]; + char data_short[4]; + char data_short_pad[4]; + + ret = bpf_copy_from_user_str(data_short, sizeof(data_short), user_ptr, 0); + + if (bpf_strncmp(data_short, 4, "tes\0") != 0 || ret != 4) + return false; + + ret = bpf_copy_from_user_str(data_short_pad, sizeof(data_short_pad), user_ptr, BPF_F_PAD_ZEROS); + + if (bpf_strncmp(data_short, 4, "tes\0") != 0 || ret != 4) + return false; + + /* Make sure this passes the verifier */ + ret = bpf_copy_from_user_str(data_long, dynamic_sz & sizeof(data_long), user_ptr, 0); + + if (ret != 0) + return false; + + ret = bpf_copy_from_user_str(data_long, sizeof(data_long), user_ptr, 0); + + if (bpf_strncmp(data_long, 10, "test_data\0") != 0 || ret != 10) + return false; + + ret = bpf_copy_from_user_str(data_long_pad, sizeof(data_long_pad), user_ptr, BPF_F_PAD_ZEROS); + + if (bpf_strncmp(data_long_pad, 10, "test_data\0") != 0 || ret != 10 || data_long_pad[19] != '\0') + return false; + + ret = bpf_copy_from_user_str(data_long_err, sizeof(data_long_err), (void *)data_long, BPF_F_PAD_ZEROS); + + if (ret > 0 || data_long_err[19] != '\0') + return false; + + ret = bpf_copy_from_user_str(data_long, sizeof(data_long), user_ptr, 2); + + if (ret != -EINVAL) + return false; + + return true; +} + SEC("uprobe.s//proc/self/exe:trigger_func3") int handle_uprobe_byname3_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) uprobe_byname3_sleepable_res = 9; + if (verify_sleepable_user_copy_str()) + uprobe_byname3_str_sleepable_res = 10; return 0; } @@ -102,7 +158,7 @@ 
int handle_uprobe_byname3_sleepable(struct pt_regs *ctx) SEC("uprobe//proc/self/exe:trigger_func3") int handle_uprobe_byname3(struct pt_regs *ctx) { - uprobe_byname3_res = 10; + uprobe_byname3_res = 11; return 0; } @@ -110,14 +166,16 @@ SEC("uretprobe.s//proc/self/exe:trigger_func3") int handle_uretprobe_byname3_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) - uretprobe_byname3_sleepable_res = 11; + uretprobe_byname3_sleepable_res = 12; + if (verify_sleepable_user_copy_str()) + uretprobe_byname3_str_sleepable_res = 13; return 0; } SEC("uretprobe//proc/self/exe:trigger_func3") int handle_uretprobe_byname3(struct pt_regs *ctx) { - uretprobe_byname3_res = 12; + uretprobe_byname3_res = 14; return 0; } -- cgit v1.2.3 From 9c5b6d4e33dd78a0511ec34756d783b7e36028c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Aug 2024 15:04:29 +0200 Subject: selftests: add xfrm policy insertion speed test script Nothing special, just test how long insertion of x policies takes. This should ideally show linear insertion speeds. Do not run this by default, it has little value, but it can be useful to check for insertion speed changes when altering the xfrm policy db implementation. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- tools/testing/selftests/net/Makefile | 2 +- .../testing/selftests/net/xfrm_policy_add_speed.sh | 83 ++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/xfrm_policy_add_speed.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8eaffd7a641c..e127a80ff713 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -56,7 +56,7 @@ TEST_PROGS += ip_local_port_range.sh TEST_PROGS += rps_default_mask.sh TEST_PROGS += big_tcp.sh TEST_PROGS += netns-sysctl.sh -TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh +TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh xfrm_policy_add_speed.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite diff --git a/tools/testing/selftests/net/xfrm_policy_add_speed.sh b/tools/testing/selftests/net/xfrm_policy_add_speed.sh new file mode 100755 index 000000000000..2fab29d3cb91 --- /dev/null +++ b/tools/testing/selftests/net/xfrm_policy_add_speed.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +source lib.sh + +timeout=4m +ret=0 +tmp=$(mktemp) +cleanup() { + cleanup_all_ns + rm -f "$tmp" +} + +trap cleanup EXIT + +maxpolicies=100000 +[ "$KSFT_MACHINE_SLOW" = "yes" ] && maxpolicies=10000 + +do_dummies4() { + local dir="$1" + local max="$2" + + local policies + local pfx + pfx=30 + policies=0 + + ip netns exec "$ns" ip xfrm policy flush + + for i in $(seq 1 100);do + local s + local d + for j in $(seq 1 255);do + s=$((i+0)) + d=$((i+100)) + + for a in $(seq 1 8 255); do + policies=$((policies+1)) + [ "$policies" -gt "$max" ] && return + echo xfrm policy add src 10.$s.$j.0/30 dst 10.$d.$j.$a/$pfx dir $dir action block + done + for a in $(seq 1 8 255); do + policies=$((policies+1)) + [ "$policies" -gt "$max" ] && return + echo xfrm policy add src 10.$s.$j.$a/30 dst 10.$d.$j.0/$pfx dir $dir action block + done + done + done +} + +setup_ns ns + +do_bench() +{ + local max="$1" + + start=$(date +%s%3N) + do_dummies4 "out" "$max" > "$tmp" + if !
timeout "$timeout" ip netns exec "$ns" ip -batch "$tmp";then + echo "WARNING: policy insertion cancelled after $timeout" + ret=1 + fi + stop=$(date +%s%3N) + + result=$((stop-start)) + + policies=$(wc -l < "$tmp") + printf "Inserted %-06s policies in $result ms\n" $policies + + have=$(ip netns exec "$ns" ip xfrm policy show | grep "action block" | wc -l) + if [ "$have" -ne "$policies" ]; then + echo "WARNING: mismatch, have $have policies, expected $policies" + ret=1 + fi +} + +p=100 +while [ $p -le "$maxpolicies" ]; do + do_bench "$p" + p="${p}0" +done + +exit $ret -- cgit v1.2.3 From f133c76409c81653cb580903da49aecefca67013 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Aug 2024 14:36:49 -0700 Subject: perf test: Support external tests for separate objdir Extend the searching for the test files so that it works when running perf from a separate objdir, and also when the perf executable is symlinked. Signed-off-by: Andi Kleen Acked-by: Namhyung Kim Cc: Andi Kleen Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240813213651.1057362-2-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/tests-scripts.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/tests-scripts.c b/tools/perf/tests/tests-scripts.c index e2042b368269..8dc1e398288c 100644 --- a/tools/perf/tests/tests-scripts.c +++ b/tools/perf/tests/tests-scripts.c @@ -29,16 +29,45 @@ static int shell_tests__dir_fd(void) { - char path[PATH_MAX], *exec_path; - static const char * const devel_dirs[] = { "./tools/perf/tests/shell", "./tests/shell", }; + struct stat st; + char path[PATH_MAX], path2[PATH_MAX], *exec_path; + static const char * const devel_dirs[] = { + "./tools/perf/tests/shell", + "./tests/shell", + "./source/tests/shell" + }; + int fd; + char *p; for (size_t i = 0; i < ARRAY_SIZE(devel_dirs); ++i) { - int fd = open(devel_dirs[i], O_PATH); + fd = open(devel_dirs[i], O_PATH); if (fd >= 0) return fd; } + /* Use directory of executable */ + if (readlink("/proc/self/exe", path2, sizeof path2) < 0) + return -1; + /* Follow another level of symlink if there */ + if (lstat(path2, &st) == 0 && (st.st_mode & S_IFMT) == S_IFLNK) { + scnprintf(path, sizeof(path), path2); + if (readlink(path, path2, sizeof path2) < 0) + return -1; + } + /* Get directory */ + p = strrchr(path2, '/'); + if (p) + *p = 0; + scnprintf(path, sizeof(path), "%s/tests/shell", path2); + fd = open(path, O_PATH); + if (fd >= 0) + return fd; + scnprintf(path, sizeof(path), "%s/source/tests/shell", path2); + fd = open(path, O_PATH); + if (fd >= 0) + return fd; + /* Then installed path. */ exec_path = get_argv_exec_path(); scnprintf(path, sizeof(path), "%s/tests/shell", exec_path); -- cgit v1.2.3 From b494b1673889c95ead43fb6978687fb61bb042d5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 22 Aug 2024 02:56:39 -0700 Subject: net: netconsole: selftests: Create a new netconsole selftest Adds a selftest that creates two virtual interfaces, assigns one to a new namespace, and assigns IP addresses to both. It listens on the destination interface using socat and configures a dynamic target on netconsole, pointing to the destination IP address. The test then checks if the message was received properly on the destination interface. 
Signed-off-by: Breno Leitao Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240822095652.3806208-1-leitao@debian.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + tools/testing/selftests/drivers/net/Makefile | 5 +- tools/testing/selftests/drivers/net/config | 4 + .../testing/selftests/drivers/net/netcons_basic.sh | 234 +++++++++++++++++++++ 4 files changed, 243 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/netcons_basic.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index e4fa9010fcb6..30a9b9450e11 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15775,6 +15775,7 @@ M: Breno Leitao S: Maintained F: Documentation/networking/netconsole.rst F: drivers/net/netconsole.c +F: tools/testing/selftests/drivers/net/netcons_basic.sh NETDEVSIM M: Jakub Kicinski diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index e54f382bcb02..39fb97a8c1df 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -1,8 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_INCLUDES := $(wildcard lib/py/*.py) +TEST_INCLUDES := $(wildcard lib/py/*.py) \ + ../../net/net_helper.sh \ + ../../net/lib.sh \ TEST_PROGS := \ + netcons_basic.sh \ ping.py \ queues.py \ stats.py \ diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config index f6a58ce8a230..a2d8af60876d 100644 --- a/tools/testing/selftests/drivers/net/config +++ b/tools/testing/selftests/drivers/net/config @@ -1,2 +1,6 @@ CONFIG_IPV6=y CONFIG_NETDEVSIM=m +CONFIG_CONFIGFS_FS=y +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh new file mode 100755 index 000000000000..06021b2059b7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_basic.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test creates two netdevsim virtual interfaces, assigns one of them (the +# "destination interface") to a new namespace, and assigns IP addresses to both +# interfaces. +# +# It listens on the destination interface using socat and configures a dynamic +# target on netconsole, pointing to the destination IP address. +# +# Finally, it checks whether the message was received properly on the +# destination interface. Note that this test may pollute the kernel log buffer +# (dmesg) and relies on dynamic configuration and namespaces being configured. 
+# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +# Simple script to test dynamic targets in netconsole +SRCIF="" # to be populated later +SRCIP=192.168.1.1 +DSTIF="" # to be populated later +DSTIP=192.168.1.2 + +PORT="6666" +MSG="netconsole selftest" +TARGET=$(mktemp -u netcons_XXXXX) +DEFAULT_PRINTK_VALUES=$(cat /proc/sys/kernel/printk) +NETCONS_CONFIGFS="/sys/kernel/config/netconsole" +NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" +# NAMESPACE will be populated by setup_ns with a random value +NAMESPACE="" + +# IDs for netdevsim +NSIM_DEV_1_ID=$((256 + RANDOM % 256)) +NSIM_DEV_2_ID=$((512 + RANDOM % 256)) + +# Used to create and delete namespaces +source "${SCRIPTDIR}"/../../net/lib.sh +source "${SCRIPTDIR}"/../../net/net_helper.sh + +# Create netdevsim interfaces +create_ifaces() { + local NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device + + echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" + echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" + udevadm settle 2> /dev/null || true + + local NSIM1=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_1_ID" + local NSIM2=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_2_ID" + + # These are global variables + SRCIF=$(find "$NSIM1"/net -maxdepth 1 -type d ! \ + -path "$NSIM1"/net -exec basename {} \;) + DSTIF=$(find "$NSIM2"/net -maxdepth 1 -type d ! \ + -path "$NSIM2"/net -exec basename {} \;) +} + +link_ifaces() { + local NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" + local SRCIF_IFIDX=$(cat /sys/class/net/"$SRCIF"/ifindex) + local DSTIF_IFIDX=$(cat /sys/class/net/"$DSTIF"/ifindex) + + exec {NAMESPACE_FD} $NSIM_DEV_SYS_LINK + then + echo "linking netdevsim1 with netdevsim2 should succeed" + cleanup + exit "${ksft_skip}" + fi +} + +function configure_ip() { + # Configure the IPs for both interfaces + ip netns exec "${NAMESPACE}" ip addr add "${DSTIP}"/24 dev "${DSTIF}" + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" up + + ip addr add "${SRCIP}"/24 dev "${SRCIF}" + ip link set "${SRCIF}" up +} + +function set_network() { + # setup_ns function is coming from lib.sh + setup_ns NAMESPACE + + # Create both interfaces, and assign the destination to a different + # namespace + create_ifaces + + # Link both interfaces back to back + link_ifaces + + configure_ip +} + +function create_dynamic_target() { + DSTMAC=$(ip netns exec "${NAMESPACE}" \ + ip link show "${DSTIF}" | awk '/ether/ {print $2}') + + # Create a dynamic target + mkdir "${NETCONS_PATH}" + + echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip + echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip + echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac + echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + + echo 1 > "${NETCONS_PATH}"/enabled +} + +function cleanup() { + local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device" + + # delete netconsole dynamic reconfiguration + echo 0 > "${NETCONS_PATH}"/enabled + # Remove the configfs entry + rmdir "${NETCONS_PATH}" + + # Delete netdevsim devices + echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_DEL" + echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_DEL" + + # this is coming from lib.sh + cleanup_all_ns + + # Restoring printk configurations + echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk +} + +function listen_port_and_save_to() { + local OUTPUT=${1} + # Just wait for 2 seconds + timeout 2 ip netns exec "${NAMESPACE}" \ + socat UDP-LISTEN:"${PORT}",fork "${OUTPUT}" +} + +function validate_result() { + local TMPFILENAME="$1" + + # Check if the file exists + if [ ! 
-f "$TMPFILENAME" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${TMPFILENAME}"; then + echo "FAIL: ${MSG} not found in ${TMPFILENAME}" >&2 + cat "${TMPFILENAME}" >&2 + exit "${ksft_fail}" + fi + + # Delete the file once it is validated, otherwise keep it + # for debugging purposes + rm "${TMPFILENAME}" + exit "${ksft_pass}" +} + +function check_for_dependencies() { + if [ "$(id -u)" -ne 0 ]; then + echo "This test must be run as root" >&2 + exit "${ksft_skip}" + fi + + if ! which socat > /dev/null ; then + echo "SKIP: socat(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if ! which ip > /dev/null ; then + echo "SKIP: ip(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if ! which udevadm > /dev/null ; then + echo "SKIP: udevadm(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if [ ! -d "${NETCONS_CONFIGFS}" ]; then + echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2 + exit "${ksft_skip}" + fi + + if ip link show "${DSTIF}" 2> /dev/null; then + echo "SKIP: interface ${DSTIF} exists in the system. Not overwriting it." >&2 + exit "${ksft_skip}" + fi + + if ip addr list | grep -E "inet.*(${SRCIP}|${DSTIP})" 2> /dev/null; then + echo "SKIP: IPs already in use. Skipping it" >&2 + exit "${ksft_skip}" + fi +} + +# ========== # +# Start here # +# ========== # +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# Listed for netconsole port inside the namespace and destination interface +listen_port_and_save_to "${OUTPUT_FILE}" & +# Wait for socat to start and listen to the port. +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +# Send the message +echo "${MSG}: ${TARGET}" > /dev/kmsg +# Wait until socat saves the file to disk +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + +# Make sure the message was received in the dst part +# and exit +validate_result "${OUTPUT_FILE}" -- cgit v1.2.3 From eb2dcde9f970ed8d3669444d47c8524b4bdf7d32 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 28 Jun 2024 11:46:11 +0100 Subject: ring-buffer: Align meta-page to sub-buffers for improved TLB usage Previously, the mapped ring-buffer layout caused misalignment between the meta-page and sub-buffers when the sub-buffer size was not a multiple of PAGE_SIZE. This prevented hardware with larger TLB entries from utilizing them effectively. Add a padding with the zero-page between the meta-page and sub-buffers. Also update the ring-buffer map_test to verify that padding. 
Link: https://lore.kernel.org/20240628104611.1443542-1-vdonnefort@google.com Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 33 ++++++++++++++++---------- tools/testing/selftests/ring-buffer/map_test.c | 14 +++++++++++ 2 files changed, 34 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c3a5e6cbb940..77dc0b25140e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6852,10 +6852,10 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; - meta->meta_page_size = PAGE_SIZE; meta->meta_struct_len = sizeof(*meta); meta->nr_subbufs = nr_subbufs; meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + meta->meta_page_size = meta->subbuf_size; rb_update_meta_page(cpu_buffer); } @@ -6949,6 +6949,12 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, !(vma->vm_flags & VM_MAYSHARE)) return -EPERM; + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + if (subbuf_order && pgoff % subbuf_pages) + return -EINVAL; + /* * Make sure the mapping cannot become writable later. Also tell the VM * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). @@ -6958,11 +6964,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, lockdep_assert_held(&cpu_buffer->mapping_lock); - subbuf_order = cpu_buffer->buffer->subbuf_order; - subbuf_pages = 1 << subbuf_order; - nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ - nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */ + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */ nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) @@ -6975,20 +6978,24 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, return -ENOMEM; if (!pgoff) { + unsigned long meta_page_padding; + pages[p++] = virt_to_page(cpu_buffer->meta_page); /* - * TODO: Align sub-buffers on their size, once - * vm_insert_pages() supports the zero-page. + * Pad with the zero-page to align the meta-page with the + * sub-buffers. 
*/ - } else { - /* Skip the meta-page */ - pgoff--; + meta_page_padding = subbuf_pages - 1; + while (meta_page_padding-- && p < nr_pages) { + unsigned long __maybe_unused zero_addr = + vma->vm_start + (PAGE_SIZE * p); - if (pgoff % subbuf_pages) { - err = -EINVAL; - goto out; + pages[p++] = ZERO_PAGE(zero_addr); } + } else { + /* Skip the meta-page */ + pgoff -= subbuf_pages; s += pgoff / subbuf_pages; } diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c index a9006fa7097e..4bb0192e43f3 100644 --- a/tools/testing/selftests/ring-buffer/map_test.c +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -228,6 +228,20 @@ TEST_F(map, data_mmap) data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, desc->cpu_fd, meta_len); ASSERT_EQ(data, MAP_FAILED); + + /* Verify meta-page padding */ + if (desc->meta->meta_page_size > getpagesize()) { + void *addr; + + data_len = desc->meta->meta_page_size; + data = mmap(NULL, data_len, + PROT_READ, MAP_SHARED, desc->cpu_fd, 0); + ASSERT_NE(data, MAP_FAILED); + + addr = (void *)((unsigned long)data + getpagesize()); + ASSERT_EQ(*((int *)addr), 0); + munmap(data, data_len); + } } FIXTURE(snapshot) { -- cgit v1.2.3 From 40153505259d8dc0e4ea6889fca5e567c42b76a9 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 31 Jul 2024 18:05:31 +0200 Subject: Documentation/srso: Document a method for checking safe RET operates properly Add a method to quickly verify whether safe RET operates properly on a given system using perf tool. Also, add a selftest which does the same thing. Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240731160531.28640-1-bp@kernel.org --- Documentation/admin-guide/hw-vuln/srso.rst | 69 +++++++++++++++++++++++++++++ tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/srso.c | 70 ++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/srso.c (limited to 'tools') diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst index 4bd3ce3ba171..2ad1c05b8c88 100644 --- a/Documentation/admin-guide/hw-vuln/srso.rst +++ b/Documentation/admin-guide/hw-vuln/srso.rst @@ -158,3 +158,72 @@ poisoned BTB entry and using that safe one for all function returns. In older Zen1 and Zen2, this is accomplished using a reinterpretation technique similar to Retbleed one: srso_untrain_ret() and srso_safe_ret(). + +Checking the safe RET mitigation actually works +----------------------------------------------- + +In case one wants to validate whether the SRSO safe RET mitigation works +on a kernel, one could use two performance counters + +* PMC_0xc8 - Count of RET/RET lw retired +* PMC_0xc9 - Count of RET/RET lw retired mispredicted + +and compare the number of RETs retired properly vs those retired +mispredicted, in kernel mode. Another way of specifying those events +is:: + + # perf list ex_ret_near_ret + + List of pre-defined events (to be used in -e or -M): + + core: + ex_ret_near_ret + [Retired Near Returns] + ex_ret_near_ret_mispred + [Retired Near Returns Mispredicted] + +Either the command using the event mnemonics:: + + # perf stat -e ex_ret_near_ret:k -e ex_ret_near_ret_mispred:k sleep 10s + +or using the raw PMC numbers:: + + # perf stat -e cpu/event=0xc8,umask=0/k -e cpu/event=0xc9,umask=0/k sleep 10s + +should give the same amount. 
I.e., every RET retired should be +mispredicted:: + + [root@brent: ~/kernel/linux/tools/perf> ./perf stat -e cpu/event=0xc8,umask=0/k -e cpu/event=0xc9,umask=0/k sleep 10s + + Performance counter stats for 'sleep 10s': + + 137,167 cpu/event=0xc8,umask=0/k + 137,173 cpu/event=0xc9,umask=0/k + + 10.004110303 seconds time elapsed + + 0.000000000 seconds user + 0.004462000 seconds sys + +vs the case when the mitigation is disabled (spec_rstack_overflow=off) +or not functioning properly, showing usually a lot smaller number of +mispredicted retired RETs vs the overall count of retired RETs during +a workload:: + + [root@brent: ~/kernel/linux/tools/perf> ./perf stat -e cpu/event=0xc8,umask=0/k -e cpu/event=0xc9,umask=0/k sleep 10s + + Performance counter stats for 'sleep 10s': + + 201,627 cpu/event=0xc8,umask=0/k + 4,074 cpu/event=0xc9,umask=0/k + + 10.003267252 seconds time elapsed + + 0.002729000 seconds user + 0.000000000 seconds sys + +Also, there is a selftest which performs the above, go to +tools/testing/selftests/x86/ and do:: + + make srso + ./srso diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5c8757a25998..d51249f14e2f 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -77,7 +77,7 @@ all_32: $(BINARIES_32) all_64: $(BINARIES_64) -EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) +EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) srso $(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $< $(EXTRA_FILES) -lrt -ldl -lm diff --git a/tools/testing/selftests/x86/srso.c b/tools/testing/selftests/x86/srso.c new file mode 100644 index 000000000000..394ec8bdeb00 --- /dev/null +++ b/tools/testing/selftests/x86/srso.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(void) +{ + struct perf_event_attr ret_attr, mret_attr; + long long count_rets, count_rets_mispred; + int rrets_fd, mrrets_fd; + unsigned int cpuid1_eax, b, c, d; + + __cpuid(1, cpuid1_eax, b, c, d); + + if (cpuid1_eax < 0x00800f00 || + cpuid1_eax > 0x00afffff) { + fprintf(stderr, "This needs to run on a Zen[1-4] machine (CPUID(1).EAX: 0x%x). 
Exiting...\n", cpuid1_eax); + exit(EXIT_FAILURE); + } + + memset(&ret_attr, 0, sizeof(struct perf_event_attr)); + memset(&mret_attr, 0, sizeof(struct perf_event_attr)); + + ret_attr.type = mret_attr.type = PERF_TYPE_RAW; + ret_attr.size = mret_attr.size = sizeof(struct perf_event_attr); + ret_attr.config = 0xc8; + mret_attr.config = 0xc9; + ret_attr.disabled = mret_attr.disabled = 1; + ret_attr.exclude_user = mret_attr.exclude_user = 1; + ret_attr.exclude_hv = mret_attr.exclude_hv = 1; + + rrets_fd = syscall(SYS_perf_event_open, &ret_attr, 0, -1, -1, 0); + if (rrets_fd == -1) { + perror("opening retired RETs fd"); + exit(EXIT_FAILURE); + } + + mrrets_fd = syscall(SYS_perf_event_open, &mret_attr, 0, -1, -1, 0); + if (mrrets_fd == -1) { + perror("opening retired mispredicted RETs fd"); + exit(EXIT_FAILURE); + } + + ioctl(rrets_fd, PERF_EVENT_IOC_RESET, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_RESET, 0); + + ioctl(rrets_fd, PERF_EVENT_IOC_ENABLE, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_ENABLE, 0); + + printf("Sleeping for 10 seconds\n"); + sleep(10); + + ioctl(rrets_fd, PERF_EVENT_IOC_DISABLE, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_DISABLE, 0); + + read(rrets_fd, &count_rets, sizeof(long long)); + read(mrrets_fd, &count_rets_mispred, sizeof(long long)); + + printf("RETs: (%lld retired <-> %lld mispredicted)\n", + count_rets, count_rets_mispred); + printf("SRSO Safe-RET mitigation works correctly if both counts are almost equal.\n"); + + return 0; +} -- cgit v1.2.3 From 59c330eccee82f9e53421dd8a83b1bc236f4557a Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Thu, 22 Aug 2024 13:35:09 +0300 Subject: selftests: tc_actions: test ingress 2nd vlan push Add new test checking the correctness of inner vlan flushing to the skb data when outer vlan tag is added through act_vlan on ingress. Signed-off-by: Boris Sukholitko Signed-off-by: Paolo Abeni --- .../testing/selftests/net/forwarding/tc_actions.sh | 23 +++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 589629636502..f2f1e99a90b2 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -4,7 +4,7 @@ ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ gact_trap_test mirred_egress_to_ingress_test \ - mirred_egress_to_ingress_tcp_test" + mirred_egress_to_ingress_tcp_test ingress_2nd_vlan_push" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -244,6 +244,27 @@ mirred_egress_to_ingress_tcp_test() log_test "mirred_egress_to_ingress_tcp ($tcflags)" } +ingress_2nd_vlan_push() +{ + tc filter add dev $swp1 ingress pref 20 chain 0 handle 20 flower \ + $tcflags num_of_vlans 1 \ + action vlan push id 100 protocol 0x8100 action goto chain 5 + tc filter add dev $swp1 ingress pref 30 chain 5 handle 30 flower \ + $tcflags num_of_vlans 2 \ + cvlan_ethtype 0x800 action pass + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -Q 10 -q + + tc_check_packets "dev $swp1 ingress" 30 1 + check_err $? 
"No double-vlan packets received" + + tc filter del dev $swp1 ingress pref 20 chain 0 handle 20 flower + tc filter del dev $swp1 ingress pref 30 chain 5 handle 30 flower + + log_test "ingress_2nd_vlan_push ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]} -- cgit v1.2.3 From 2da44703a54403b9048ba268b2896dc0537a154f Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Thu, 22 Aug 2024 13:35:10 +0300 Subject: selftests: tc_actions: test egress 2nd vlan push Add new test checking the correctness of inner vlan flushing to the skb data when outer vlan tag is added through act_vlan on egress. Signed-off-by: Boris Sukholitko Signed-off-by: Paolo Abeni --- .../testing/selftests/net/forwarding/tc_actions.sh | 25 +++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index f2f1e99a90b2..ea89e558672d 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -4,7 +4,8 @@ ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ gact_trap_test mirred_egress_to_ingress_test \ - mirred_egress_to_ingress_tcp_test ingress_2nd_vlan_push" + mirred_egress_to_ingress_tcp_test \ + ingress_2nd_vlan_push egress_2nd_vlan_push" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -265,6 +266,28 @@ ingress_2nd_vlan_push() log_test "ingress_2nd_vlan_push ($tcflags)" } +egress_2nd_vlan_push() +{ + tc filter add dev $h1 egress pref 20 chain 0 handle 20 flower \ + $tcflags num_of_vlans 0 \ + action vlan push id 10 protocol 0x8100 \ + pipe action vlan push id 100 protocol 0x8100 action goto chain 5 + tc filter add dev $h1 egress pref 30 chain 5 handle 30 flower \ + $tcflags num_of_vlans 2 \ + cvlan_ethtype 0x800 action pass + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -q + + tc_check_packets "dev $h1 egress" 30 1 + check_err $? "No double-vlan packets received" + + tc filter del dev $h1 egress pref 20 chain 0 handle 20 flower + tc filter del dev $h1 egress pref 30 chain 5 handle 30 flower + + log_test "egress_2nd_vlan_push ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]} -- cgit v1.2.3 From 59cfdf3f3349019fbfc986a285afcc3873d155f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Aug 2024 10:05:58 -1000 Subject: scx_central: Fix smatch checker warning ARRAY_ELEM_PTR() is an access macro used to help the BPF verifier not confused by offseted memory acceeses by yiedling a valid pointer or NULL in a way that's clear to the verifier. As such, the canonical usage involves checking NULL return from the macro. Note that in many cases, the NULL condition can never happen - they're there just to hint the verifier. In a bpf_loop in scx_central.bpf.c::central_dispatch(), the NULL check was incorrect in that there was another dereference of the pointer in addition to the NULL checked access. This worked as the pointer can never be NULL and the verifier could tell it would never be NULL in this case. 
However, this still looks wrong and trips smatch: ./tools/sched_ext/scx_central.bpf.c:205 ____central_dispatch() error: we previously assumed 'gimme' could be null (see line 201) ./tools/sched_ext/scx_central.bpf.c 195 196 if (!scx_bpf_dispatch_nr_slots()) 197 break; 198 199 /* central's gimme is never set */ 200 gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); 201 if (gimme && !*gimme) ^^^^^ If gimme is NULL 202 continue; 203 204 if (dispatch_to_cpu(cpu)) --> 205 *gimme = false; Fix the NULL check so that there are no derefs if NULL. This doesn't change actual behavior. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter Link: http://lkml.kernel.org/r/<955e1c3c-ace2-4a1d-b246-15b8196038a3@stanley.mountain> --- tools/sched_ext/scx_central.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 1d8fd570eaa7..8dd8eb73b6b8 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -198,7 +198,7 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) /* central's gimme is never set */ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); - if (gimme && !*gimme) + if (!gimme || !*gimme) continue; if (dispatch_to_cpu(cpu)) -- cgit v1.2.3 From 79504a47339cd9a1574d974869aecaba838e1213 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:51 +0100 Subject: selftests/net: Clean-up double assignment Correct copy'n'paste typo: the previous line already initialises get_all to 1. Reported-by: Nassiri, Mohammad Closes: https://lore.kernel.org/all/DM6PR04MB4202BC58A9FD5BDD24A16E8EC56F2@DM6PR04MB4202.namprd04.prod.outlook.com/ Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-1-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/sock.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/sock.c b/tools/testing/selftests/net/tcp_ao/lib/sock.c index 15aeb0963058..0ffda966c677 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/sock.c +++ b/tools/testing/selftests/net/tcp_ao/lib/sock.c @@ -379,7 +379,6 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out) key_dump[0].nkeys = nr_keys; key_dump[0].get_all = 1; - key_dump[0].get_all = 1; err = getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, key_dump, &key_dump_sz); if (err) { -- cgit v1.2.3 From 7053e788ded5f8ec589e4507d764d6a894780d6c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:52 +0100 Subject: selftests/net: Provide test_snprintf() helper Instead of pre-allocating a fixed-sized buffer of TEST_MSG_BUFFER_SIZE and printing into it, call vsnprintf() with str = NULL, which will return the needed size of the buffer. This hack is documented in man 3 vsnprintf. Essentially, in C++ terms, it re-invents std::stringstream, which is going to be used to print different tracing paths and formatted strings. Use it straight away in __test_print() - which is the thread-safe version of printing in selftests.
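The sizing idiom in isolation, as a standalone sketch (the selftest's version in the diff below takes a ready va_list and therefore needs va_copy(); a plain variadic function can simply restart va_start):

/* vsnprintf(NULL, 0, ...) returns the number of bytes the formatted
 * string needs, excluding the terminating NUL, per man 3 vsnprintf.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *xsprintf(const char *fmt, ...)
{
	va_list ap;
	char *buf;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(NULL, 0, fmt, ap);	/* measure only */
	va_end(ap);
	if (n < 0)
		return NULL;

	buf = malloc(n + 1);
	if (!buf)
		return NULL;

	va_start(ap, fmt);
	vsnprintf(buf, n + 1, fmt, ap);		/* actually print */
	va_end(ap);
	return buf;
}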
Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-2-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/aolib.h | 61 +++++++++++++++++++++----- 1 file changed, 51 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h index fbc7f6111815..78466863435f 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h +++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h @@ -37,17 +37,58 @@ extern void __test_xfail(const char *buf); extern void __test_error(const char *buf); extern void __test_skip(const char *buf); -__attribute__((__format__(__printf__, 2, 3))) -static inline void __test_print(void (*fn)(const char *), const char *fmt, ...) +static inline char *test_snprintf(const char *fmt, va_list vargs) { -#define TEST_MSG_BUFFER_SIZE 4096 - char buf[TEST_MSG_BUFFER_SIZE]; - va_list arg; - - va_start(arg, fmt); - vsnprintf(buf, sizeof(buf), fmt, arg); - va_end(arg); - fn(buf); + char *ret = NULL; + size_t size = 0; + va_list tmp; + int n = 0; + + va_copy(tmp, vargs); + n = vsnprintf(ret, size, fmt, tmp); + if (n < 0) + return NULL; + + size = n + 1; + ret = malloc(size); + if (!ret) + return NULL; + + n = vsnprintf(ret, size, fmt, vargs); + if (n < 0 || n > size - 1) { + free(ret); + return NULL; + } + return ret; +} + +static __printf(1, 2) inline char *test_sprintf(const char *fmt, ...) +{ + va_list vargs; + char *ret; + + va_start(vargs, fmt); + ret = test_snprintf(fmt, vargs); + va_end(vargs); + + return ret; +} + +static __printf(2, 3) inline void __test_print(void (*fn)(const char *), + const char *fmt, ...) +{ + va_list vargs; + char *msg; + + va_start(vargs, fmt); + msg = test_snprintf(fmt, vargs); + va_end(vargs); + + if (!msg) + return; + + fn(msg); + free(msg); } #define test_print(fmt, ...) \ -- cgit v1.2.3 From bc2468f98221a194769cac80051da6b86cc39c2f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:53 +0100 Subject: selftests/net: Be consistent in kconfig checks Most of the functions in the tcp-ao lib/ return a negative errno or -1 on failure. That creates inconsistencies in lib/kconfig, which saves what the error code was. Moreover, the uninitialized kconfig value is -1, which may also be the result of a check. Define KCONFIG_UNKNOWN and save the negative return code, rather than the libc-style errno.
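The reason a positive sentinel works: probe results are either 0 (supported) or a negative errno, so a -1 sentinel is indistinguishable from -EPERM. An illustrative, self-contained sketch of the caching pattern (names are made up, not the selftest's):

#include <errno.h>
#include <stdio.h>

#define KCONFIG_UNKNOWN	1	/* positive: never a valid probe result */

static int probe(void)
{
	return -EPERM;		/* probe returns 0 or a negative errno */
}

static int feature_supported(void)
{
	static int cached = KCONFIG_UNKNOWN;

	/* A -1 sentinel here would misread a cached -EPERM (== -1)
	 * as "not probed yet" and re-run the probe every call.
	 */
	if (cached == KCONFIG_UNKNOWN)
		cached = probe();
	return cached == 0;
}

int main(void)
{
	printf("supported: %d\n", feature_supported());
	return 0;
}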
Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-3-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/kconfig.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c index f279ffc3843b..3bf4a7e4b3c9 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c +++ b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c @@ -6,7 +6,7 @@ #include "aolib.h" struct kconfig_t { - int _errno; /* the returned error if not supported */ + int _error; /* negative errno if not supported */ int (*check_kconfig)(int *error); }; @@ -62,7 +62,7 @@ static int has_tcp_ao(int *err) memcpy(&tmp.addr, &addr, sizeof(addr)); *err = 0; if (setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY, &tmp, sizeof(tmp)) < 0) { - *err = errno; + *err = -errno; if (errno != ENOPROTOOPT) ret = -errno; } @@ -87,7 +87,7 @@ static int has_tcp_md5(int *err) */ *err = 0; if (test_set_md5(sk, addr_any, 0, -1, DEFAULT_TEST_PASSWORD)) { - *err = errno; + *err = -errno; if (errno != ENOPROTOOPT && errno == ENOMEM) { test_print("setsockopt(TCP_MD5SIG_EXT): %m"); ret = -errno; @@ -116,13 +116,14 @@ static int has_vrfs(int *err) return ret; } +#define KCONFIG_UNKNOWN 1 static pthread_mutex_t kconfig_lock = PTHREAD_MUTEX_INITIALIZER; static struct kconfig_t kconfig[__KCONFIG_LAST__] = { - { -1, has_net_ns }, - { -1, has_veth }, - { -1, has_tcp_ao }, - { -1, has_tcp_md5 }, - { -1, has_vrfs }, + { KCONFIG_UNKNOWN, has_net_ns }, + { KCONFIG_UNKNOWN, has_veth }, + { KCONFIG_UNKNOWN, has_tcp_ao }, + { KCONFIG_UNKNOWN, has_tcp_md5 }, + { KCONFIG_UNKNOWN, has_vrfs }, }; const char *tests_skip_reason[__KCONFIG_LAST__] = { @@ -138,11 +139,11 @@ bool kernel_config_has(enum test_needs_kconfig k) bool ret; pthread_mutex_lock(&kconfig_lock); - if (kconfig[k]._errno == -1) { - if (kconfig[k].check_kconfig(&kconfig[k]._errno)) + if (kconfig[k]._error == KCONFIG_UNKNOWN) { + if (kconfig[k].check_kconfig(&kconfig[k]._error)) test_error("Failed to initialize kconfig %u", k); } - ret = kconfig[k]._errno == 0; + ret = kconfig[k]._error == 0; pthread_mutex_unlock(&kconfig_lock); return ret; } -- cgit v1.2.3 From 8acb1806e8c2846ae619f500a3229a325c3304f5 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:54 +0100 Subject: selftests/net: Open /proc/thread-self in open_netns() It turns out that open_netns() is rarely called from the child thread and more often from the parent thread. Yet, on initialization of the kconfig checks, either thread may reach the kconfig_lock mutex first. VRF-related checks create a temporary ksft-check VRF in an unshare()'d namespace and then setns() back to the original. As the original was opened from "/proc/self/ns/net", it's valid for the thread leader (parent) but invalid for the child, resulting in the following failure on tests that check has_vrfs() support: > # ok 54 TCP-AO required on socket + TCP-MD5 key: prefailed as expected: Key was rejected by service > # not ok 55 # error 381[unsigned-md5.c:24] Failed to add a VRF: -17 > # not ok 56 # error 383[unsigned-md5.c:33] Failed to add a route to VRF: -22: Key was rejected by service > not ok 1 selftests: net/tcp_ao: unsigned-md5_ipv6 # exit=1 Use "/proc/thread-self/ns/net", which is valid for any thread.
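A small standalone sketch of the distinction, assuming a kernel with /proc/thread-self (Linux 3.17+): /proc/self names the thread-group leader's namespaces, so after a non-leader thread unshares its netns, only /proc/thread-self/ns/net reliably names that thread's own namespace:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int open_thread_netns(void)
{
	/* Valid for any thread, leader or not */
	return open("/proc/thread-self/ns/net", O_RDONLY);
}

int main(void)
{
	/* Move this thread to a fresh netns; needs CAP_SYS_ADMIN */
	if (unshare(CLONE_NEWNET))
		perror("unshare");

	int fd = open_thread_netns();

	printf("netns fd: %d\n", fd);
	if (fd >= 0)
		close(fd);
	return 0;
}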
Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-4-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index e408b9243b2c..d5212ffe9489 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -111,7 +111,7 @@ static void sig_int(int signo) int open_netns(void) { - const char *netns_path = "/proc/self/ns/net"; + const char *netns_path = "/proc/thread-self/ns/net"; int fd; fd = open(netns_path, O_RDONLY); -- cgit v1.2.3 From a9e1693406f96f3739611dd08518c9eda3acecfd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:55 +0100 Subject: selftests/net: Don't forget to close nsfd after switch_save_ns() The switch_save_ns() helper is supposed to help with switching to another namespace for some action and then returning to the original namespace. The fd should be closed. Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-5-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index d5212ffe9489..86a4f6e20450 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -142,6 +142,13 @@ int switch_save_ns(int new_ns) return ret; } +void switch_close_ns(int fd) +{ + if (setns(fd, CLONE_NEWNET)) + test_error("setns()"); + close(fd); +} + static int nsfd_outside = -1; static int nsfd_parent = -1; static int nsfd_child = -1; @@ -296,7 +303,7 @@ static bool is_optmem_namespaced(void) int old_ns = switch_save_ns(nsfd_child); optmem_ns = !access(optmem_file, F_OK); - switch_ns(old_ns); + switch_close_ns(old_ns); } return !!optmem_ns; } @@ -317,7 +324,7 @@ size_t test_get_optmem(void) test_error("can't read from %s", optmem_file); fclose(foptmem); if (!is_optmem_namespaced()) - switch_ns(old_ns); + switch_close_ns(old_ns); return ret; } @@ -339,7 +346,7 @@ static void __test_set_optmem(size_t new, size_t *old) test_error("can't write %zu to %s", new, optmem_file); fclose(foptmem); if (!is_optmem_namespaced()) - switch_ns(old_ns); + switch_close_ns(old_ns); } static void test_revert_optmem(void) -- cgit v1.2.3 From 1c69e1f433990a81b5a5d415d8892ebcaacb985b Mon Sep 17 00:00:00 2001 From: Mohammad Nassiri Date: Fri, 23 Aug 2024 23:04:56 +0100 Subject: selftests/tcp_ao: Fix printing format for uint64_t It's not safe to use the '%zu' specifier for printing uint64_t on 32-bit systems. For uint64_t, we should use the 'PRIu64' macro from the inttypes.h header. This ensures that the uint64_t is printed correctly from the selftests regardless of the system architecture. Signed-off-by: Mohammad Nassiri [Added missing spaces in fail/ok messages and uint64_t cast in setsockopt-closed, as otherwise it was giving warnings on 64bit.
And carried it to netdev ml] Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-6-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/connect-deny.c | 4 ++-- tools/testing/selftests/net/tcp_ao/connect.c | 4 ++-- tools/testing/selftests/net/tcp_ao/restore.c | 8 ++++---- tools/testing/selftests/net/tcp_ao/self-connect.c | 4 ++-- tools/testing/selftests/net/tcp_ao/seq-ext.c | 12 ++++++------ tools/testing/selftests/net/tcp_ao/setsockopt-closed.c | 4 ++-- tools/testing/selftests/net/tcp_ao/unsigned-md5.c | 4 ++-- 7 files changed, 20 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c index 185a2f6e5ff3..5691f3d00603 100644 --- a/tools/testing/selftests/net/tcp_ao/connect-deny.c +++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c @@ -84,10 +84,10 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, after_cnt = netstat_get_one(cnt_name, NULL); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } diff --git a/tools/testing/selftests/net/tcp_ao/connect.c b/tools/testing/selftests/net/tcp_ao/connect.c index 81653b47f303..9eecc4a1072e 100644 --- a/tools/testing/selftests/net/tcp_ao/connect.c +++ b/tools/testing/selftests/net/tcp_ao/connect.c @@ -67,14 +67,14 @@ static void *client_fn(void *arg) netstat_free(ns_after); if (nr_packets > (after_aogood - before_aogood)) { - test_fail("TCPAOGood counter mismatch: %zu > (%zu - %zu)", + test_fail("TCPAOGood counter mismatch: %zu > (%" PRIu64 " - %" PRIu64 ")", nr_packets, after_aogood, before_aogood); return NULL; } if (test_tcp_ao_counters_cmp("connect", &ao1, &ao2, TEST_CNT_GOOD)) return NULL; - test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %" PRIu64, + test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %zu", before_aogood, ao1.ao_info_pkt_good, ao1.key_cnts[0].pkt_good, after_aogood, ao2.ao_info_pkt_good, diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c index 8fdc808df325..7b91d7fde2bc 100644 --- a/tools/testing/selftests/net/tcp_ao/restore.c +++ b/tools/testing/selftests/net/tcp_ao/restore.c @@ -71,10 +71,10 @@ static void try_server_run(const char *tst_name, unsigned int port, test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } @@ -183,10 +183,10 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" 
PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } synchronize_threads(); /* 3: verified => closed */ diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c index a5698b0a3718..e56931d38e06 100644 --- a/tools/testing/selftests/net/tcp_ao/self-connect.c +++ b/tools/testing/selftests/net/tcp_ao/self-connect.c @@ -87,7 +87,7 @@ static void tcp_self_connect(const char *tst, unsigned int port, netstat_free(ns_after); if (after_aogood <= before_aogood) { - test_fail("%s: TCPAOGood counter mismatch: %zu <= %zu", + test_fail("%s: TCPAOGood counter mismatch: %" PRIu64 " <= %" PRIu64, tst, after_aogood, before_aogood); close(sk); return; @@ -148,7 +148,7 @@ static void tcp_self_connect(const char *tst, unsigned int port, netstat_free(ns_after); close(sk); if (after_aogood <= before_aogood) { - test_fail("%s: TCPAOGood counter mismatch: %zu <= %zu", + test_fail("%s: TCPAOGood counter mismatch: %" PRIu64 " <= %" PRIu64, tst, after_aogood, before_aogood); return; } diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c index ad4e77d6823e..9c7dde7fd776 100644 --- a/tools/testing/selftests/net/tcp_ao/seq-ext.c +++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c @@ -134,15 +134,15 @@ static void *server_fn(void *arg) test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD); if (after_good <= before_good) { - test_fail("TCPAOGood counter did not increase: %zu <= %zu", + test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64, after_good, before_good); } else { - test_ok("TCPAOGood counter increased %zu => %zu", + test_ok("TCPAOGood counter increased %" PRIu64 " => %" PRIu64, before_good, after_good); } after_bad = netstat_get_one("TCPAOBad", NULL); if (after_bad) - test_fail("TCPAOBad counter is non-zero: %zu", after_bad); + test_fail("TCPAOBad counter is non-zero: %" PRIu64, after_bad); else test_ok("TCPAOBad counter didn't increase"); test_enable_repair(sk); @@ -219,15 +219,15 @@ static void *client_fn(void *arg) test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD); if (after_good <= before_good) { - test_fail("TCPAOGood counter did not increase: %zu <= %zu", + test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64, after_good, before_good); } else { - test_ok("TCPAOGood counter increased %zu => %zu", + test_ok("TCPAOGood counter increased %" PRIu64 " => %" PRIu64, before_good, after_good); } after_bad = netstat_get_one("TCPAOBad", NULL); if (after_bad) - test_fail("TCPAOBad counter is non-zero: %zu", after_bad); + test_fail("TCPAOBad counter is non-zero: %" PRIu64, after_bad); else test_ok("TCPAOBad counter didn't increase"); diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 517930f9721b..5eee826c37aa 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -30,8 +30,8 @@ static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, #define __cmp_ao(member) \ do { \ if (info->member != tmp.member) { \ - test_fail("%s: getsockopt(): " __stringify(member) " %zu != %zu", \ - tst, (size_t)info->member, (size_t)tmp.member); \ + test_fail("%s: getsockopt(): " __stringify(member) " %" PRIu64 " != %" PRIu64, \ + tst, (uint64_t)info->member, 
(uint64_t)tmp.member); \ return; \ } \ } while(0) diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index 6b59a652159f..ec2848036341 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -100,10 +100,10 @@ static void try_accept(const char *tst_name, unsigned int port, after_cnt = netstat_get_one(cnt_name, NULL); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } if (ao_addr) -- cgit v1.2.3 From 044e037051252ca8df07e1355bf4d7964645a6e8 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:57 +0100 Subject: selftests/net: Synchronize client/server before counters checks On tests that expect failure, the timeout value is TEST_RETRANSMIT_SEC == 1 second, which is big enough for most devices under test. But on a particularly slow machine/VM, 1 second might not be enough for another thread to be scheduled and attempt to connect(). It is not a problem for tests that expect connect() to succeed, as the timeout value for them (TEST_TIMEOUT_SEC) is intentionally bigger. One obvious way to solve this would be to increase TEST_RETRANSMIT_SEC. But as all tests would then use the bigger timeout, the delays would add up. A less obvious way keeps the timeouts for expected connect() failures low: just synchronize the two threads, which ensures that, before the counter checks, the other thread got a chance to run and time out on connect(). The expected increase of the related counter for the listen() socket still tests the expected failure. This never happens on my machine, but I suppose the majority of netdev's connect-deny-* flakes [1] are caused by this. Prevents the following testing issue: > # selftests: net/tcp_ao: connect-deny_ipv6 > # 1..21 > # # 462[lib/setup.c:243] rand seed 1720905426 > # TAP version 13 > # ok 1 Non-AO server + AO client > # not ok 2 Non-AO server + AO client: TCPAOKeyNotFound counter did not increase: 0 <= 0 > # ok 3 AO server + Non-AO client > # ok 4 AO server + Non-AO client: counter TCPAORequired increased 0 => 1 ...
[1]: https://netdev-3.bots.linux.dev/vmksft-tcp-ao/results/681741/6-connect-deny-ipv6/stdout Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-7-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/connect-deny.c | 3 +++ tools/testing/selftests/net/tcp_ao/restore.c | 6 ++++-- tools/testing/selftests/net/tcp_ao/seq-ext.c | 6 ++++-- tools/testing/selftests/net/tcp_ao/unsigned-md5.c | 3 +++ 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c index 5691f3d00603..166ad4549ef2 100644 --- a/tools/testing/selftests/net/tcp_ao/connect-deny.c +++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c @@ -71,10 +71,12 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, } } + synchronize_threads(); /* before counter checks */ if (pwd && test_get_tcp_ao_counters(lsk, &ao_cnt2)) test_error("test_get_tcp_ao_counters()"); close(lsk); + if (pwd) test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected); @@ -180,6 +182,7 @@ static void try_connect(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* before counter checks */ if (ret < 0) { if (fault(KEYREJECT) && ret == -EKEYREJECTED) { test_ok("%s: connect() was prevented", tst_name); diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c index 7b91d7fde2bc..f6ea2190f43d 100644 --- a/tools/testing/selftests/net/tcp_ao/restore.c +++ b/tools/testing/selftests/net/tcp_ao/restore.c @@ -64,6 +64,7 @@ static void try_server_run(const char *tst_name, unsigned int port, else test_ok("%s: server alive", tst_name); } + synchronize_threads(); /* 3: counters checks */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_cnt = netstat_get_one(cnt_name, NULL); @@ -82,7 +83,7 @@ static void try_server_run(const char *tst_name, unsigned int port, * Before close() as that will send FIN and move the peer in TCP_CLOSE * and that will prevent reading AO counters from the peer's socket. 
*/ - synchronize_threads(); /* 3: verified => closed */ + synchronize_threads(); /* 4: verified => closed */ out: close(sk); } @@ -176,6 +177,7 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, else test_ok("%s: post-migrate connection is alive", tst_name); } + synchronize_threads(); /* 3: counters checks */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_cnt = netstat_get_one(cnt_name, NULL); @@ -189,7 +191,7 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } - synchronize_threads(); /* 3: verified => closed */ + synchronize_threads(); /* 4: verified => closed */ close(sk); } diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c index 9c7dde7fd776..885866cc193c 100644 --- a/tools/testing/selftests/net/tcp_ao/seq-ext.c +++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c @@ -116,7 +116,7 @@ static void *server_fn(void *arg) sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, client_new_port, &ao1); - synchronize_threads(); /* 5: verify counters during SEQ-number rollover */ + synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ bytes = test_server_run(sk, quota, TEST_TIMEOUT_SEC); if (bytes != quota) { if (bytes > 0) @@ -127,6 +127,7 @@ static void *server_fn(void *arg) test_ok("server alive"); } + synchronize_threads(); /* 6: verify counters after SEQ-number rollover */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_good = netstat_get_one("TCPAOGood", NULL); @@ -206,12 +207,13 @@ static void *client_fn(void *arg) sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, test_server_port + 1, &ao1); - synchronize_threads(); /* 5: verify counters during SEQ-number rollover */ + synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("post-migrate verify failed"); else test_ok("post-migrate connection alive"); + synchronize_threads(); /* 5: verify counters after SEQ-number rollover */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_good = netstat_get_one("TCPAOGood", NULL); diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index ec2848036341..02346b58efbd 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -70,6 +70,7 @@ static void try_accept(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; err = test_wait_fd(lsk, timeout, 0); + synchronize_threads(); /* connect()/accept() timeouts */ if (err == -ETIMEDOUT) { if (!fault(TIMEOUT)) test_fail("timed out for accept()"); @@ -283,6 +284,7 @@ static void try_connect(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* connect()/accept() timeouts */ if (ret < 0) { if (fault(KEYREJECT) && ret == -EKEYREJECTED) test_ok("%s: connect() was prevented", tst_name); @@ -451,6 +453,7 @@ static void try_to_add(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? 
TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* connect()/accept() timeouts */ if (ret <= 0) { test_error("%s: connect() returned %d", tst_name, ret); goto out; -- cgit v1.2.3 From 586d87021f224b0434372f5b4418216e29b0a544 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:58 +0100 Subject: selftests/net: Add trace events matching to tcp_ao Set up trace points, adding a new ftrace instance in order to not interfere with the rest of the system, filtering by net namespace cookies. Raise a new background thread that parses trace_pipe and matches events against the list of expected events. Wiring up trace events to selftests provides another insight into whether anything unexpected is happening in the tcp-ao code (e.g. key rotation when it's not expected). Note: in real programs libtraceevent should be used instead of this manual labor of setting ftrace up and parsing. I'm not using it here as I don't want to have an .so library dependency that one would have to bring into a VM or DUT (Device Under Test). Please don't copy it into any real-world programs that aren't tests. Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-8-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/Makefile | 3 +- tools/testing/selftests/net/tcp_ao/bench-lookups.c | 2 +- tools/testing/selftests/net/tcp_ao/config | 1 + tools/testing/selftests/net/tcp_ao/connect-deny.c | 18 +- tools/testing/selftests/net/tcp_ao/connect.c | 2 +- tools/testing/selftests/net/tcp_ao/icmps-discard.c | 2 +- .../testing/selftests/net/tcp_ao/key-management.c | 18 +- tools/testing/selftests/net/tcp_ao/lib/aolib.h | 119 +++++ .../testing/selftests/net/tcp_ao/lib/ftrace-tcp.c | 559 +++++++++++++++++++++ tools/testing/selftests/net/tcp_ao/lib/ftrace.c | 543 ++++++++++++++++++++ tools/testing/selftests/net/tcp_ao/lib/kconfig.c | 8 + tools/testing/selftests/net/tcp_ao/lib/setup.c | 2 +- tools/testing/selftests/net/tcp_ao/lib/utils.c | 26 + tools/testing/selftests/net/tcp_ao/restore.c | 16 +- tools/testing/selftests/net/tcp_ao/rst.c | 2 +- tools/testing/selftests/net/tcp_ao/self-connect.c | 15 +- tools/testing/selftests/net/tcp_ao/seq-ext.c | 10 +- .../selftests/net/tcp_ao/setsockopt-closed.c | 2 +- tools/testing/selftests/net/tcp_ao/unsigned-md5.c | 28 +- 19 files changed, 1358 insertions(+), 18 deletions(-) create mode 100644 tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c create mode 100644 tools/testing/selftests/net/tcp_ao/lib/ftrace.c (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile index bd88b90b902b..5b0205c70c39 100644 --- a/tools/testing/selftests/net/tcp_ao/Makefile +++ b/tools/testing/selftests/net/tcp_ao/Makefile @@ -31,7 +31,8 @@ CFLAGS += $(KHDR_INCLUDES) CFLAGS += -iquote ./lib/ -I ../../../../include/ # Library -LIBSRC := kconfig.c netlink.c proc.c repair.c setup.c sock.c utils.c +LIBSRC := ftrace.c ftrace-tcp.c kconfig.c netlink.c +LIBSRC += proc.c repair.c setup.c sock.c utils.c LIBOBJ := $(LIBSRC:%.c=$(LIBDIR)/%.o) EXTRA_CLEAN += $(LIBOBJ) $(LIB) diff --git a/tools/testing/selftests/net/tcp_ao/bench-lookups.c b/tools/testing/selftests/net/tcp_ao/bench-lookups.c index a1e6e007c291..6736484996a3 100644 --- a/tools/testing/selftests/net/tcp_ao/bench-lookups.c +++ b/tools/testing/selftests/net/tcp_ao/bench-lookups.c @@ -355,6 +355,6
@@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(30, server_fn, client_fn); + test_init(31, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/config b/tools/testing/selftests/net/tcp_ao/config index d3277a9de987..3605e38711cb 100644 --- a/tools/testing/selftests/net/tcp_ao/config +++ b/tools/testing/selftests/net/tcp_ao/config @@ -7,4 +7,5 @@ CONFIG_NET_L3_MASTER_DEV=y CONFIG_NET_VRF=y CONFIG_TCP_AO=y CONFIG_TCP_MD5SIG=y +CONFIG_TRACEPOINTS=y CONFIG_VETH=m diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c index 166ad4549ef2..d418162d335f 100644 --- a/tools/testing/selftests/net/tcp_ao/connect-deny.c +++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c @@ -215,30 +215,44 @@ out: static void *client_fn(void *arg) { - union tcp_addr wrong_addr, network_addr; + union tcp_addr wrong_addr, network_addr, addr_any = {}; unsigned int port = test_server_port; if (inet_pton(TEST_FAMILY, TEST_WRONG_IP, &wrong_addr) != 1) test_error("Can't convert ip address %s", TEST_WRONG_IP); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Non-AO server + AO client", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server + Non-AO client", port++, NULL, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Wrong password", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Wrong rcv id", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_sk_expect(TCP_AO_SYNACK_NO_KEY, this_ip_dest, addr_any, + port, 0, 100, 100); try_connect("Wrong snd id", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_WRONG_MACLEN, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Different maclen", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Server: Wrong addr", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); @@ -262,6 +276,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(21, server_fn, client_fn); + test_init(22, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/connect.c b/tools/testing/selftests/net/tcp_ao/connect.c index 9eecc4a1072e..f1d8d29e393f 100644 --- a/tools/testing/selftests/net/tcp_ao/connect.c +++ b/tools/testing/selftests/net/tcp_ao/connect.c @@ -85,6 +85,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(1, server_fn, client_fn); + test_init(2, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/icmps-discard.c b/tools/testing/selftests/net/tcp_ao/icmps-discard.c index d69bcba3c929..a1614f0d8c44 100644 --- a/tools/testing/selftests/net/tcp_ao/icmps-discard.c +++ b/tools/testing/selftests/net/tcp_ao/icmps-discard.c @@ -444,6 
+444,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(3, server_fn, client_fn); + test_init(4, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/key-management.c b/tools/testing/selftests/net/tcp_ao/key-management.c index 24e62120b792..d4385b52c10b 100644 --- a/tools/testing/selftests/net/tcp_ao/key-management.c +++ b/tools/testing/selftests/net/tcp_ao/key-management.c @@ -965,7 +965,7 @@ static void end_client(const char *tst_name, int sk, unsigned int nr_keys, synchronize_threads(); /* 5: counters */ } -static void try_unmatched_keys(int sk, int *rnext_index) +static void try_unmatched_keys(int sk, int *rnext_index, unsigned int port) { struct test_key *key; unsigned int i = 0; @@ -1013,6 +1013,9 @@ static void try_unmatched_keys(int sk, int *rnext_index) test_error("all keys on server match the client"); if (test_set_key(sk, -1, key->server_keyid)) test_error("Can't change the current key"); + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, + -1, key->server_keyid, -1); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("verify failed"); *rnext_index = i; @@ -1054,6 +1057,10 @@ static void check_current_back(const char *tst_name, unsigned int port, return; if (test_set_key(sk, collection.keys[rotate_to_index].client_keyid, -1)) test_error("Can't change the current key"); + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, + collection.keys[rotate_to_index].client_keyid, + collection.keys[current_index].client_keyid, -1); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("verify failed"); /* There is a race here: between setting the current_key with @@ -1085,6 +1092,11 @@ static void roll_over_keys(const char *tst_name, unsigned int port, for (i = rnext_index + 1; rotations > 0; i++, rotations--) { if (i >= collection.nr_keys) i = 0; + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, + this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, + i == 0 ? 
-1 : collection.keys[i - 1].server_keyid, + collection.keys[i].server_keyid, -1); if (test_set_key(sk, -1, collection.keys[i].server_keyid)) test_error("Can't change the Rnext key"); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) { @@ -1124,7 +1136,7 @@ static void try_client_match(const char *tst_name, unsigned int port, rnext_index, msg_len, nr_packets); if (sk < 0) return; - try_unmatched_keys(sk, &rnext_index); + try_unmatched_keys(sk, &rnext_index, port); end_client(tst_name, sk, nr_keys, current_index, rnext_index, NULL); } @@ -1181,6 +1193,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(120, server_fn, client_fn); + test_init(121, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h index 78466863435f..db44e77428dd 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h +++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h @@ -144,6 +144,7 @@ enum test_needs_kconfig { KCONFIG_TCP_AO, /* required */ KCONFIG_TCP_MD5, /* optional, for TCP-MD5 features */ KCONFIG_NET_VRF, /* optional, for L3/VRF testing */ + KCONFIG_FTRACE, /* optional, for tracepoints checks */ __KCONFIG_LAST__ }; extern bool kernel_config_has(enum test_needs_kconfig k); @@ -183,6 +184,8 @@ static inline void test_init2(unsigned int ntests, __test_init(ntests, family, prefix, taddr1, taddr2, peer1, peer2); } extern void test_add_destructor(void (*d)(void)); +extern void test_init_ftrace(int nsfd1, int nsfd2); +extern int test_setup_tracing(void); /* To adjust optmem socket limit, approximately estimate a number, * that is bigger than sizeof(struct tcp_ao_key). @@ -257,12 +260,17 @@ static inline void test_init(unsigned int ntests, } extern void synchronize_threads(void); extern void switch_ns(int fd); +extern int switch_save_ns(int fd); +extern void switch_close_ns(int fd); extern __thread union tcp_addr this_ip_addr; extern __thread union tcp_addr this_ip_dest; extern int test_family; extern void randomize_buffer(void *buf, size_t buflen); +extern __printf(3, 4) int test_echo(const char *fname, bool append, + const char *fmt, ...); + extern int open_netns(void); extern int unshare_open_netns(void); extern const char veth_name[]; @@ -643,4 +651,115 @@ static inline int test_add_repaired_key(int sk, return test_verify_socket_key(sk, &tmp); } +#define DEFAULT_FTRACE_BUFFER_KB 10000 +#define DEFAULT_TRACER_LINES_ARR 200 +struct test_ftracer; +extern uint64_t ns_cookie1, ns_cookie2; + +enum ftracer_op { + FTRACER_LINE_DISCARD = 0, + FTRACER_LINE_PRESERVE, + FTRACER_EXIT, +}; + +extern struct test_ftracer *create_ftracer(const char *name, + enum ftracer_op (*process_line)(const char *line), + void (*destructor)(struct test_ftracer *tracer), + bool (*expecting_more)(void), + size_t lines_buf_sz, size_t buffer_size_kb); +extern int setup_trace_event(struct test_ftracer *tracer, + const char *event, const char *filter); +extern void destroy_ftracer(struct test_ftracer *tracer); +extern const size_t tracer_get_savedlines_nr(struct test_ftracer *tracer); +extern const char **tracer_get_savedlines(struct test_ftracer *tracer); + +enum trace_events { + /* TCP_HASH_EVENT */ + TCP_HASH_BAD_HEADER = 0, + TCP_HASH_MD5_REQUIRED, + TCP_HASH_MD5_UNEXPECTED, + TCP_HASH_MD5_MISMATCH, + TCP_HASH_AO_REQUIRED, + /* TCP_AO_EVENT */ + TCP_AO_HANDSHAKE_FAILURE, + TCP_AO_WRONG_MACLEN, + TCP_AO_MISMATCH, + TCP_AO_KEY_NOT_FOUND, + TCP_AO_RNEXT_REQUEST, + /* TCP_AO_EVENT_SK */ + TCP_AO_SYNACK_NO_KEY, 
+ /* TCP_AO_EVENT_SNE */ + TCP_AO_SND_SNE_UPDATE, + TCP_AO_RCV_SNE_UPDATE, + __MAX_TRACE_EVENTS +}; + +extern int __trace_event_expect(enum trace_events type, int family, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen, int sne); + +static inline void trace_hash_event_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, L3index, + fin, syn, rst, psh, ack, + -1, -1, -1, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, L3index, + fin, syn, rst, psh, ack, + keyid, rnext, maclen, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_sk_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, + int keyid, int rnext) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, -1, + -1, -1, -1, -1, -1, + keyid, rnext, -1, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_sne_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int sne) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, -1, + -1, -1, -1, -1, -1, + -1, -1, -1, sne); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +extern int setup_aolib_ftracer(void); + #endif /* _AOLIB_H_ */ diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c new file mode 100644 index 000000000000..24380c68fec6 --- /dev/null +++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <inttypes.h> +#include <pthread.h> +#include "aolib.h" + +static const char *trace_event_names[__MAX_TRACE_EVENTS] = { + /* TCP_HASH_EVENT */ + "tcp_hash_bad_header", + "tcp_hash_md5_required", + "tcp_hash_md5_unexpected", + "tcp_hash_md5_mismatch", + "tcp_hash_ao_required", + /* TCP_AO_EVENT */ + "tcp_ao_handshake_failure", + "tcp_ao_wrong_maclen", + "tcp_ao_mismatch", + "tcp_ao_key_not_found", + "tcp_ao_rnext_request", + /* TCP_AO_EVENT_SK */ + "tcp_ao_synack_no_key", + /* TCP_AO_EVENT_SNE */ + "tcp_ao_snd_sne_update", + "tcp_ao_rcv_sne_update" +}; + +struct expected_trace_point { + /* required */ + enum trace_events type; + int family; + union tcp_addr src; + union tcp_addr dst; + + /* optional */ + int src_port; + int dst_port; + int L3index; + + int fin; + int syn; + int rst; + int psh; + int ack; + + int keyid; + int rnext; + int maclen; + int sne; + + size_t matched; +}; + +static struct expected_trace_point *exp_tps; +static size_t exp_tps_nr; +static size_t exp_tps_size; +static pthread_mutex_t exp_tps_mutex = PTHREAD_MUTEX_INITIALIZER; + +int __trace_event_expect(enum trace_events type, int family, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int
syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen, int sne) +{ + struct expected_trace_point new_tp = { + .type = type, + .family = family, + .src = src, + .dst = dst, + .src_port = src_port, + .dst_port = dst_port, + .L3index = L3index, + .fin = fin, + .syn = syn, + .rst = rst, + .psh = psh, + .ack = ack, + .keyid = keyid, + .rnext = rnext, + .maclen = maclen, + .sne = sne, + .matched = 0, + }; + int ret = 0; + + if (!kernel_config_has(KCONFIG_FTRACE)) + return 0; + + pthread_mutex_lock(&exp_tps_mutex); + if (exp_tps_nr == exp_tps_size) { + struct expected_trace_point *tmp; + + if (exp_tps_size == 0) + exp_tps_size = 10; + else + exp_tps_size = exp_tps_size * 1.6; + + tmp = reallocarray(exp_tps, exp_tps_size, sizeof(exp_tps[0])); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + exp_tps = tmp; + } + exp_tps[exp_tps_nr] = new_tp; + exp_tps_nr++; +out: + pthread_mutex_unlock(&exp_tps_mutex); + return ret; +} + +static void free_expected_events(void) +{ + /* We're from the process destructor - not taking the mutex */ + exp_tps_size = 0; + free(exp_tps); + exp_tps = NULL; +} + +struct trace_point { + int family; + union tcp_addr src; + union tcp_addr dst; + unsigned int src_port; + unsigned int dst_port; + int L3index; + unsigned int fin:1, + syn:1, + rst:1, + psh:1, + ack:1; + + unsigned int keyid; + unsigned int rnext; + unsigned int maclen; + + unsigned int sne; +}; + +static bool lookup_expected_event(int event_type, struct trace_point *e) +{ + size_t i; + + pthread_mutex_lock(&exp_tps_mutex); + for (i = 0; i < exp_tps_nr; i++) { + struct expected_trace_point *p = &exp_tps[i]; + size_t sk_size; + + if (p->type != event_type) + continue; + if (p->family != e->family) + continue; + if (p->family == AF_INET) + sk_size = sizeof(p->src.a4); + else + sk_size = sizeof(p->src.a6); + if (memcmp(&p->src, &e->src, sk_size)) + continue; + if (memcmp(&p->dst, &e->dst, sk_size)) + continue; + if (p->src_port >= 0 && p->src_port != e->src_port) + continue; + if (p->dst_port >= 0 && p->dst_port != e->dst_port) + continue; + if (p->L3index >= 0 && p->L3index != e->L3index) + continue; + + if (p->fin >= 0 && p->fin != e->fin) + continue; + if (p->syn >= 0 && p->syn != e->syn) + continue; + if (p->rst >= 0 && p->rst != e->rst) + continue; + if (p->psh >= 0 && p->psh != e->psh) + continue; + if (p->ack >= 0 && p->ack != e->ack) + continue; + + if (p->keyid >= 0 && p->keyid != e->keyid) + continue; + if (p->rnext >= 0 && p->rnext != e->rnext) + continue; + if (p->maclen >= 0 && p->maclen != e->maclen) + continue; + if (p->sne >= 0 && p->sne != e->sne) + continue; + p->matched++; + pthread_mutex_unlock(&exp_tps_mutex); + return true; + } + pthread_mutex_unlock(&exp_tps_mutex); + return false; +} + +static int check_event_type(const char *line) +{ + size_t i; + + /* + * This should have been a set or hashmap, but it's a selftest, + * so... KISS.
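+ * A linear strncmp() scan over the __MAX_TRACE_EVENTS names is cheap + * enough for a per-line check at selftest scale.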
+ */ + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + if (!strncmp(trace_event_names[i], line, strlen(trace_event_names[i]))) + return i; + } + return -1; +} + +static bool event_has_flags(enum trace_events event) +{ + switch (event) { + case TCP_HASH_BAD_HEADER: + case TCP_HASH_MD5_REQUIRED: + case TCP_HASH_MD5_UNEXPECTED: + case TCP_HASH_MD5_MISMATCH: + case TCP_HASH_AO_REQUIRED: + case TCP_AO_HANDSHAKE_FAILURE: + case TCP_AO_WRONG_MACLEN: + case TCP_AO_MISMATCH: + case TCP_AO_KEY_NOT_FOUND: + case TCP_AO_RNEXT_REQUEST: + return true; + default: + return false; + } +} + +static int tracer_ip_split(int family, char *src, char **addr, char **port) +{ + char *p; + + if (family == AF_INET) { + /* format is <addr>:port, i.e.: 10.0.254.1:7015 */ + *addr = src; + p = strchr(src, ':'); + if (!p) { + test_print("Couldn't parse trace event addr:port %s", src); + return -EINVAL; + } + *p++ = '\0'; + *port = p; + return 0; + } + if (family != AF_INET6) + return -EAFNOSUPPORT; + + /* format is [<addr>]:port, i.e.: [2001:db8:254::1]:7013 */ + *addr = strchr(src, '['); + p = strchr(src, ']'); + + if (!p || !*addr) { + test_print("Couldn't parse trace event [addr]:port %s", src); + return -EINVAL; + } + + *addr = *addr + 1; /* '[' */ + *p++ = '\0'; /* ']' */ + if (*p != ':') { + test_print("Couldn't parse trace event :port %s", p); + return -EINVAL; + } + *p++ = '\0'; /* ':' */ + *port = p; + return 0; +} + +static int tracer_scan_address(int family, char *src, + union tcp_addr *dst, unsigned int *port) +{ + char *addr, *port_str; + int ret; + + ret = tracer_ip_split(family, src, &addr, &port_str); + if (ret) + return ret; + + if (inet_pton(family, addr, dst) != 1) { + test_print("Couldn't parse trace event addr %s", addr); + return -EINVAL; + } + errno = 0; + *port = (unsigned int)strtoul(port_str, NULL, 10); + if (errno != 0) { + test_print("Couldn't parse trace event port %s", port_str); + return -errno; + } + return 0; +} + +static int tracer_scan_event(const char *line, enum trace_events event, + struct trace_point *out) +{ + char *src = NULL, *dst = NULL, *family = NULL; + char fin, syn, rst, psh, ack; + int nr_matched, ret = 0; + uint64_t netns_cookie; + + switch (event) { + case TCP_HASH_BAD_HEADER: + case TCP_HASH_MD5_REQUIRED: + case TCP_HASH_MD5_UNEXPECTED: + case TCP_HASH_MD5_MISMATCH: + case TCP_HASH_AO_REQUIRED: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms L3index=%d [%c%c%c%c%c]", + &netns_cookie, &family, + &src, &dst, &out->L3index, + &fin, &syn, &rst, &psh, &ack); + if (nr_matched != 10) + test_print("Couldn't parse trace event, matched = %d/10", + nr_matched); + break; + } + case TCP_AO_HANDSHAKE_FAILURE: + case TCP_AO_WRONG_MACLEN: + case TCP_AO_MISMATCH: + case TCP_AO_KEY_NOT_FOUND: + case TCP_AO_RNEXT_REQUEST: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms L3index=%d [%c%c%c%c%c] keyid=%u rnext=%u maclen=%u", + &netns_cookie, &family, + &src, &dst, &out->L3index, + &fin, &syn, &rst, &psh, &ack, + &out->keyid, &out->rnext, &out->maclen); + if (nr_matched != 13) + test_print("Couldn't parse trace event, matched = %d/13", + nr_matched); + break; + } + case TCP_AO_SYNACK_NO_KEY: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms keyid=%u rnext=%u", + &netns_cookie, &family, + &src, &dst, &out->keyid, &out->rnext); + if (nr_matched != 6) + test_print("Couldn't parse trace event, matched = %d/6", + nr_matched); + break; + } + case TCP_AO_SND_SNE_UPDATE: + case TCP_AO_RCV_SNE_UPDATE:
{ + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms sne=%u", + &netns_cookie, &family, + &src, &dst, &out->sne); + if (nr_matched != 5) + test_print("Couldn't parse trace event, matched = %d/5", + nr_matched); + break; + } + default: + return -1; + } + + if (family) { + if (!strcmp(family, "AF_INET")) { + out->family = AF_INET; + } else if (!strcmp(family, "AF_INET6")) { + out->family = AF_INET6; + } else { + test_print("Couldn't parse trace event family %s", family); + ret = -EINVAL; + goto out_free; + } + } + + if (event_has_flags(event)) { + out->fin = (fin == 'F'); + out->syn = (syn == 'S'); + out->rst = (rst == 'R'); + out->psh = (psh == 'P'); + out->ack = (ack == '.'); + + if ((fin != 'F' && fin != ' ') || + (syn != 'S' && syn != ' ') || + (rst != 'R' && rst != ' ') || + (psh != 'P' && psh != ' ') || + (ack != '.' && ack != ' ')) { + test_print("Couldn't parse trace event flags %c%c%c%c%c", + fin, syn, rst, psh, ack); + ret = -EINVAL; + goto out_free; + } + } + + if (src && tracer_scan_address(out->family, src, &out->src, &out->src_port)) { + ret = -EINVAL; + goto out_free; + } + + if (dst && tracer_scan_address(out->family, dst, &out->dst, &out->dst_port)) { + ret = -EINVAL; + goto out_free; + } + + if (netns_cookie != ns_cookie1 && netns_cookie != ns_cookie2) { + test_print("Net namespace filter for trace event didn't work: %" PRIu64 " != %" PRIu64 " OR %" PRIu64, + netns_cookie, ns_cookie1, ns_cookie2); + ret = -EINVAL; + } + +out_free: + free(src); + free(dst); + free(family); + return ret; +} + +static enum ftracer_op aolib_tracer_process_event(const char *line) +{ + int event_type = check_event_type(line); + struct trace_point tmp = {}; + + if (event_type < 0) + return FTRACER_LINE_PRESERVE; + + if (tracer_scan_event(line, event_type, &tmp)) + return FTRACER_LINE_PRESERVE; + + return lookup_expected_event(event_type, &tmp) ? + FTRACER_LINE_DISCARD : FTRACER_LINE_PRESERVE; +} + +static void dump_trace_event(struct expected_trace_point *e) +{ + char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; + + if (!inet_ntop(e->family, &e->src, src, INET6_ADDRSTRLEN)) + test_error("inet_ntop()"); + if (!inet_ntop(e->family, &e->dst, dst, INET6_ADDRSTRLEN)) + test_error("inet_ntop()"); + test_print("trace event filter %s [%s:%d => %s:%d, L3index %d, flags: %s%s%s%s%s, keyid: %d, rnext: %d, maclen: %d, sne: %d] = %zu", + trace_event_names[e->type], + src, e->src_port, dst, e->dst_port, e->L3index, + (e->fin > 0) ? "F" : (e->fin == 0) ? "!F" : "", + (e->syn > 0) ? "S" : (e->syn == 0) ? "!S" : "", + (e->rst > 0) ? "R" : (e->rst == 0) ? "!R" : "", + (e->psh > 0) ? "P" : (e->psh == 0) ? "!P" : "", + (e->ack > 0) ? "." : (e->ack == 0) ? "!." 
: "", + e->keyid, e->rnext, e->maclen, e->sne, e->matched); +} + +static void print_match_stats(bool unexpected_events) +{ + size_t matches_per_type[__MAX_TRACE_EVENTS] = {}; + bool expected_but_none = false; + size_t i, total_matched = 0; + char *stat_line = NULL; + + for (i = 0; i < exp_tps_nr; i++) { + struct expected_trace_point *e = &exp_tps[i]; + + total_matched += e->matched; + matches_per_type[e->type] += e->matched; + if (!e->matched) + expected_but_none = true; + } + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + if (!matches_per_type[i]) + continue; + stat_line = test_sprintf("%s%s[%zu] ", stat_line ?: "", + trace_event_names[i], + matches_per_type[i]); + if (!stat_line) + test_error("test_sprintf()"); + } + + if (unexpected_events || expected_but_none) { + for (i = 0; i < exp_tps_nr; i++) + dump_trace_event(&exp_tps[i]); + } + + if (unexpected_events) + return; + + if (expected_but_none) + test_fail("Some trace events were expected, but didn't occur"); + else if (total_matched) + test_ok("Trace events matched expectations: %zu %s", + total_matched, stat_line); + else + test_ok("No unexpected trace events during the test run"); +} + +#define dump_events(fmt, ...) \ + __test_print(__test_msg, fmt, ##__VA_ARGS__) +static void check_free_events(struct test_ftracer *tracer) +{ + const char **lines; + size_t nr; + + if (!kernel_config_has(KCONFIG_FTRACE)) { + test_skip("kernel config doesn't have ftrace - no checks"); + return; + } + + nr = tracer_get_savedlines_nr(tracer); + lines = tracer_get_savedlines(tracer); + print_match_stats(!!nr); + if (!nr) + return; + + errno = 0; + test_xfail("Trace events [%zu] were not expected:", nr); + while (nr) + dump_events("\t%s", lines[--nr]); +} + +static int setup_tcp_trace_events(struct test_ftracer *tracer) +{ + char *filter; + size_t i; + int ret; + + filter = test_sprintf("net_cookie == %zu || net_cookie == %zu", + ns_cookie1, ns_cookie2); + if (!filter) + return -ENOMEM; + + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + char *event_name = test_sprintf("tcp/%s", trace_event_names[i]); + + if (!event_name) { + ret = -ENOMEM; + break; + } + ret = setup_trace_event(tracer, event_name, filter); + free(event_name); + if (ret) + break; + } + + free(filter); + return ret; +} + +static void aolib_tracer_destroy(struct test_ftracer *tracer) +{ + check_free_events(tracer); + free_expected_events(); +} + +static bool aolib_tracer_expecting_more(void) +{ + size_t i; + + for (i = 0; i < exp_tps_nr; i++) + if (!exp_tps[i].matched) + return true; + return false; +} + +int setup_aolib_ftracer(void) +{ + struct test_ftracer *f; + + f = create_ftracer("aolib", aolib_tracer_process_event, + aolib_tracer_destroy, aolib_tracer_expecting_more, + DEFAULT_FTRACE_BUFFER_KB, DEFAULT_TRACER_LINES_ARR); + if (!f) + return -1; + + return setup_tcp_trace_events(f); +} diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace.c new file mode 100644 index 000000000000..e4d0b173bc94 --- /dev/null +++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../../../../include/linux/kernel.h" +#include "aolib.h" + +static char ftrace_path[] = "ksft-ftrace-XXXXXX"; +static bool ftrace_mounted; +uint64_t ns_cookie1, ns_cookie2; + +struct test_ftracer { + pthread_t tracer_thread; + int error; + char *instance_path; + FILE *trace_pipe; + + enum ftracer_op (*process_line)(const char *line); 
+ void (*destructor)(struct test_ftracer *tracer); + bool (*expecting_more)(void); + + char **saved_lines; + size_t saved_lines_size; + size_t next_line_ind; + + pthread_cond_t met_all_expected; + pthread_mutex_t met_all_expected_lock; + + struct test_ftracer *next; +}; + +static struct test_ftracer *ftracers; +static pthread_mutex_t ftracers_lock = PTHREAD_MUTEX_INITIALIZER; + +static int mount_ftrace(void) +{ + if (!mkdtemp(ftrace_path)) + test_error("Can't create temp dir"); + + if (mount("tracefs", ftrace_path, "tracefs", 0, "rw")) + return -errno; + + ftrace_mounted = true; + + return 0; +} + +static void unmount_ftrace(void) +{ + if (ftrace_mounted && umount(ftrace_path)) + test_print("Failed on cleanup: can't unmount tracefs: %m"); + + if (rmdir(ftrace_path)) + test_error("Failed on cleanup: can't remove ftrace dir %s", + ftrace_path); +} + +struct opts_list_t { + char *opt_name; + struct opts_list_t *next; +}; + +static int disable_trace_options(const char *ftrace_path) +{ + struct opts_list_t *opts_list = NULL; + char *fopts, *line = NULL; + size_t buf_len = 0; + ssize_t line_len; + int ret = 0; + FILE *opts; + + fopts = test_sprintf("%s/%s", ftrace_path, "trace_options"); + if (!fopts) + return -ENOMEM; + + opts = fopen(fopts, "r+"); + if (!opts) { + ret = -errno; + goto out_free; + } + + while ((line_len = getline(&line, &buf_len, opts)) != -1) { + struct opts_list_t *tmp; + + if (!strncmp(line, "no", 2)) + continue; + + tmp = malloc(sizeof(*tmp)); + if (!tmp) { + ret = -ENOMEM; + goto out_free_opts_list; + } + tmp->next = opts_list; + tmp->opt_name = test_sprintf("no%s", line); + if (!tmp->opt_name) { + ret = -ENOMEM; + free(tmp); + goto out_free_opts_list; + } + opts_list = tmp; + } + + while (opts_list) { + struct opts_list_t *tmp = opts_list; + + fseek(opts, 0, SEEK_SET); + fwrite(tmp->opt_name, 1, strlen(tmp->opt_name), opts); + + opts_list = opts_list->next; + free(tmp->opt_name); + free(tmp); + } + +out_free_opts_list: + while (opts_list) { + struct opts_list_t *tmp = opts_list; + + opts_list = opts_list->next; + free(tmp->opt_name); + free(tmp); + } + free(line); + fclose(opts); +out_free: + free(fopts); + return ret; +} + +static int setup_buffer_size(const char *ftrace_path, size_t sz) +{ + char *fbuf_size = test_sprintf("%s/buffer_size_kb", ftrace_path); + int ret; + + if (!fbuf_size) + return -1; + + ret = test_echo(fbuf_size, 0, "%zu", sz); + free(fbuf_size); + return ret; +} + +static int setup_ftrace_instance(struct test_ftracer *tracer, const char *name) +{ + char *tmp; + + tmp = test_sprintf("%s/instances/ksft-%s-XXXXXX", ftrace_path, name); + if (!tmp) + return -ENOMEM; + + tracer->instance_path = mkdtemp(tmp); + if (!tracer->instance_path) { + free(tmp); + return -errno; + } + + return 0; +} + +static void remove_ftrace_instance(struct test_ftracer *tracer) +{ + if (rmdir(tracer->instance_path)) + test_print("Failed on cleanup: can't remove ftrace instance %s", + tracer->instance_path); + free(tracer->instance_path); +} + +static void tracer_cleanup(void *arg) +{ + struct test_ftracer *tracer = arg; + + fclose(tracer->trace_pipe); +} + +static void tracer_set_error(struct test_ftracer *tracer, int error) +{ + if (!tracer->error) + tracer->error = error; +} + +const size_t tracer_get_savedlines_nr(struct test_ftracer *tracer) +{ + return tracer->next_line_ind; +} + +const char **tracer_get_savedlines(struct test_ftracer *tracer) +{ + return (const char **)tracer->saved_lines; +} + +static void *tracer_thread_func(void *arg) +{ + struct test_ftracer *tracer = 
arg; + + pthread_cleanup_push(tracer_cleanup, arg); + + while (tracer->next_line_ind < tracer->saved_lines_size) { + char **lp = &tracer->saved_lines[tracer->next_line_ind]; + enum ftracer_op op; + size_t buf_len = 0; + ssize_t line_len; + + line_len = getline(lp, &buf_len, tracer->trace_pipe); + if (line_len == -1) + break; + + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + op = tracer->process_line(*lp); + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + + if (tracer->expecting_more) { + pthread_mutex_lock(&tracer->met_all_expected_lock); + if (!tracer->expecting_more()) + pthread_cond_signal(&tracer->met_all_expected); + pthread_mutex_unlock(&tracer->met_all_expected_lock); + } + + if (op == FTRACER_LINE_DISCARD) + continue; + if (op == FTRACER_EXIT) + break; + if (op != FTRACER_LINE_PRESERVE) + test_error("unexpected tracer command %d", op); + + tracer->next_line_ind++; + buf_len = 0; + } + if (tracer->next_line_ind >= tracer->saved_lines_size) + test_print("too many lines in ftracer buffer %zu, exiting tracer", + tracer->next_line_ind); + + pthread_cleanup_pop(1); + return NULL; +} + +static int setup_trace_thread(struct test_ftracer *tracer) +{ + int ret = 0; + char *path; + + path = test_sprintf("%s/trace_pipe", tracer->instance_path); + if (!path) + return -ENOMEM; + + tracer->trace_pipe = fopen(path, "r"); + if (!tracer->trace_pipe) { + ret = -errno; + goto out_free; + } + + if (pthread_create(&tracer->tracer_thread, NULL, + tracer_thread_func, (void *)tracer)) { + ret = -errno; + fclose(tracer->trace_pipe); + } + +out_free: + free(path); + return ret; +} + +static void stop_trace_thread(struct test_ftracer *tracer) +{ + void *res; + + if (pthread_cancel(tracer->tracer_thread)) { + test_print("Can't stop tracer pthread: %m"); + tracer_set_error(tracer, -errno); + } + if (pthread_join(tracer->tracer_thread, &res)) { + test_print("Can't join tracer pthread: %m"); + tracer_set_error(tracer, -errno); + } + if (res != PTHREAD_CANCELED) { + test_print("Tracer thread wasn't canceled"); + tracer_set_error(tracer, -errno); + } + if (tracer->error) + test_fail("tracer errored by %s", strerror(tracer->error)); +} + +static void final_wait_for_events(struct test_ftracer *tracer, + unsigned timeout_sec) +{ + struct timespec timeout; + struct timeval now; + int ret = 0; + + if (!tracer->expecting_more) + return; + + pthread_mutex_lock(&tracer->met_all_expected_lock); + gettimeofday(&now, NULL); + timeout.tv_sec = now.tv_sec + timeout_sec; + timeout.tv_nsec = now.tv_usec * 1000; + + while (tracer->expecting_more() && ret != ETIMEDOUT) + ret = pthread_cond_timedwait(&tracer->met_all_expected, + &tracer->met_all_expected_lock, &timeout); + pthread_mutex_unlock(&tracer->met_all_expected_lock); +} + +int setup_trace_event(struct test_ftracer *tracer, + const char *event, const char *filter) +{ + char *enable_path, *filter_path, *instance = tracer->instance_path; + int ret; + + enable_path = test_sprintf("%s/events/%s/enable", instance, event); + if (!enable_path) + return -ENOMEM; + + filter_path = test_sprintf("%s/events/%s/filter", instance, event); + if (!filter_path) { + ret = -ENOMEM; + goto out_free; + } + + ret = test_echo(filter_path, 0, "%s", filter); + if (!ret) + ret = test_echo(enable_path, 0, "1"); + +out_free: + free(filter_path); + free(enable_path); + return ret; +} + +struct test_ftracer *create_ftracer(const char *name, + enum ftracer_op (*process_line)(const char *line), + void (*destructor)(struct test_ftracer *tracer), + bool (*expecting_more)(void), + size_t lines_buf_sz, size_t buffer_size_kb) +{ + struct test_ftracer
*tracer; + int err; + + /* XXX: separate __create_ftracer() helper and do here + * if (!kernel_config_has(KCONFIG_FTRACE)) + * return NULL; + */ + + tracer = malloc(sizeof(*tracer)); + if (!tracer) { + test_print("malloc()"); + return NULL; + } + + memset(tracer, 0, sizeof(*tracer)); + + err = setup_ftrace_instance(tracer, name); + if (err) { + test_print("setup_ftrace_instance(): %d", err); + goto err_free; + } + + err = disable_trace_options(tracer->instance_path); + if (err) { + test_print("disable_trace_options(): %d", err); + goto err_remove; + } + + err = setup_buffer_size(tracer->instance_path, buffer_size_kb); + if (err) { + test_print("setup_buffer_size(): %d", err); + goto err_remove; + } + + tracer->saved_lines = calloc(lines_buf_sz, sizeof(tracer->saved_lines[0])); + if (!tracer->saved_lines) { + test_print("calloc()"); + goto err_remove; + } + tracer->saved_lines_size = lines_buf_sz; + + tracer->process_line = process_line; + tracer->destructor = destructor; + tracer->expecting_more = expecting_more; + + err = pthread_cond_init(&tracer->met_all_expected, NULL); + if (err) { + test_print("pthread_cond_init(): %d", err); + goto err_free_lines; + } + + err = pthread_mutex_init(&tracer->met_all_expected_lock, NULL); + if (err) { + test_print("pthread_mutex_init(): %d", err); + goto err_cond_destroy; + } + + err = setup_trace_thread(tracer); + if (err) { + test_print("setup_trace_thread(): %d", err); + goto err_mutex_destroy; + } + + pthread_mutex_lock(&ftracers_lock); + tracer->next = ftracers; + ftracers = tracer; + pthread_mutex_unlock(&ftracers_lock); + + return tracer; + +err_mutex_destroy: + pthread_mutex_destroy(&tracer->met_all_expected_lock); +err_cond_destroy: + pthread_cond_destroy(&tracer->met_all_expected); +err_free_lines: + free(tracer->saved_lines); +err_remove: + remove_ftrace_instance(tracer); +err_free: + free(tracer); + return NULL; +} + +static void __destroy_ftracer(struct test_ftracer *tracer) +{ + size_t i; + + final_wait_for_events(tracer, TEST_TIMEOUT_SEC); + stop_trace_thread(tracer); + remove_ftrace_instance(tracer); + if (tracer->destructor) + tracer->destructor(tracer); + for (i = 0; i < tracer->saved_lines_size; i++) + free(tracer->saved_lines[i]); + pthread_cond_destroy(&tracer->met_all_expected); + pthread_mutex_destroy(&tracer->met_all_expected_lock); + free(tracer); +} + +void destroy_ftracer(struct test_ftracer *tracer) +{ + pthread_mutex_lock(&ftracers_lock); + if (tracer == ftracers) { + ftracers = tracer->next; + } else { + struct test_ftracer *f = ftracers; + + while (f->next != tracer) { + if (!f->next) + test_error("tracers list corruption or double free %p", tracer); + f = f->next; + } + f->next = tracer->next; + } + tracer->next = NULL; + pthread_mutex_unlock(&ftracers_lock); + __destroy_ftracer(tracer); +} + +static void destroy_all_ftracers(void) +{ + struct test_ftracer *f; + + pthread_mutex_lock(&ftracers_lock); + f = ftracers; + ftracers = NULL; + pthread_mutex_unlock(&ftracers_lock); + + while (f) { + struct test_ftracer *n = f->next; + + f->next = NULL; + __destroy_ftracer(f); + f = n; + } +} + +static void test_unset_tracing(void) +{ + destroy_all_ftracers(); + unmount_ftrace(); +} + +int test_setup_tracing(void) +{ + /* + * Just a basic protection - this should be called only once from + * lib/kconfig. Not thread safe, which is fine as it's early, before + * threads are created.
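+ * kconfig.c triggers this via has_ftrace() the first time + * KCONFIG_FTRACE is queried.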
+ */ + static int already_set; + int err; + + if (already_set) + return -1; + + /* Needs net-namespace cookies for filters */ + if (ns_cookie1 == ns_cookie2) { + test_print("net-namespace cookies: %" PRIu64 " == %" PRIu64 ", can't set up tracing", + ns_cookie1, ns_cookie2); + return -1; + } + + already_set = 1; + + test_add_destructor(test_unset_tracing); + + err = mount_ftrace(); + if (err) { + test_print("failed to mount_ftrace(): %d", err); + return err; + } + + return setup_aolib_ftracer(); +} + +static int get_ns_cookie(int nsfd, uint64_t *out) +{ + int old_ns = switch_save_ns(nsfd); + socklen_t size = sizeof(*out); + int sk; + + sk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sk < 0) { + test_print("socket(): %m"); + switch_close_ns(old_ns); + return -errno; + } + + if (getsockopt(sk, SOL_SOCKET, SO_NETNS_COOKIE, out, &size)) { + test_print("getsockopt(SO_NETNS_COOKIE): %m"); + close(sk); + switch_close_ns(old_ns); + return -errno; + } + + close(sk); + switch_close_ns(old_ns); + return 0; +} + +void test_init_ftrace(int nsfd1, int nsfd2) +{ + get_ns_cookie(nsfd1, &ns_cookie1); + get_ns_cookie(nsfd2, &ns_cookie2); + /* Populate kernel config state */ + kernel_config_has(KCONFIG_FTRACE); +} diff --git a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c index 3bf4a7e4b3c9..9f1c175846f8 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c +++ b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c @@ -116,6 +116,12 @@ static int has_vrfs(int *err) return ret; } +static int has_ftrace(int *err) +{ + *err = test_setup_tracing(); + return 0; +} + #define KCONFIG_UNKNOWN 1 static pthread_mutex_t kconfig_lock = PTHREAD_MUTEX_INITIALIZER; static struct kconfig_t kconfig[__KCONFIG_LAST__] = { @@ -124,6 +130,7 @@ static struct kconfig_t kconfig[__KCONFIG_LAST__] = { { KCONFIG_UNKNOWN, has_tcp_ao }, { KCONFIG_UNKNOWN, has_tcp_md5 }, { KCONFIG_UNKNOWN, has_vrfs }, + { KCONFIG_UNKNOWN, has_ftrace }, }; const char *tests_skip_reason[__KCONFIG_LAST__] = { @@ -132,6 +139,7 @@ const char *tests_skip_reason[__KCONFIG_LAST__] = { "Tests require TCP-AO support (CONFIG_TCP_AO)", "setsockopt(TCP_MD5SIG_EXT) is not supported (CONFIG_TCP_MD5)", "VRFs are not supported (CONFIG_NET_VRF)", + "Ftrace points are not supported (CONFIG_TRACEPOINTS)", }; bool kernel_config_has(enum test_needs_kconfig k) diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index 86a4f6e20450..a27cc03c9fbd 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -250,9 +250,9 @@ void __test_init(unsigned int ntests, int family, unsigned int prefix, test_print("rand seed %u", (unsigned int)seed); srand(seed); - ksft_print_header(); init_namespaces(); + test_init_ftrace(nsfd_parent, nsfd_child); if (add_veth(veth_name, nsfd_parent, nsfd_child)) test_error("Failed to add veth"); diff --git a/tools/testing/selftests/net/tcp_ao/lib/utils.c b/tools/testing/selftests/net/tcp_ao/lib/utils.c index 372daca525f5..bdf5522c9213 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/utils.c +++ b/tools/testing/selftests/net/tcp_ao/lib/utils.c @@ -21,6 +21,32 @@ void randomize_buffer(void *buf, size_t buflen) } } +__printf(3, 4) int test_echo(const char *fname, bool append, + const char *fmt, ...) +{ + size_t len, written; + va_list vargs; + char *msg; + FILE *f; + + f = fopen(fname, append ?
"a" : "w"); + if (!f) + return -errno; + + va_start(vargs, fmt); + msg = test_snprintf(fmt, vargs); + va_end(vargs); + if (!msg) { + fclose(f); + return -1; + } + len = strlen(msg); + written = fwrite(msg, 1, len, f); + fclose(f); + free(msg); + return written == len ? 0 : -1; +} + const struct sockaddr_in6 addr_any6 = { .sin6_family = AF_INET6, }; diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c index f6ea2190f43d..ecc6f1e3a414 100644 --- a/tools/testing/selftests/net/tcp_ao/restore.c +++ b/tools/testing/selftests/net/tcp_ao/restore.c @@ -208,22 +208,36 @@ static void *client_fn(void *arg) test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.snt_isn += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong send ISN", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.rcv_isn += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong receive ISN", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.snd_sne += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + /* not expecting server => client mismatches as only snd sne is broken */ test_sk_restore("TCP-AO with wrong send SEQ ext number", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_NS_BAD | TEST_CNT_GOOD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.rcv_sne += 1; + /* not expecting client => server mismatches as only rcv sne is broken */ + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong receive SEQ ext number", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_NS_GOOD | TEST_CNT_BAD); @@ -233,6 +247,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(20, server_fn, client_fn); + test_init(21, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index a2fe88d35ac0..6364facaa63e 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -455,6 +455,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(14, server_fn, client_fn); + test_init(15, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c index e56931d38e06..3ecd2b58de6a 100644 --- a/tools/testing/selftests/net/tcp_ao/self-connect.c +++ b/tools/testing/selftests/net/tcp_ao/self-connect.c @@ -163,17 +163,26 @@ static void *client_fn(void *arg) setup_lo_intf("lo"); tcp_self_connect("self-connect(same keyids)", port++, false, false); + + /* expecting rnext to change based on the first segment RNext != Current */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1); 
tcp_self_connect("self-connect(different keyids)", port++, true, false); tcp_self_connect("self-connect(restore)", port, false, true); - port += 2; + port += 2; /* restore test restores over different port */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1); + /* intentionally on restore they are added to the socket in different order */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port + 1, port + 1, 0, -1, -1, -1, -1, -1, 5, 7, -1); tcp_self_connect("self-connect(restore, different keyids)", port, true, true); - port += 2; + port += 2; /* restore test restores over different port */ return NULL; } int main(int argc, char *argv[]) { - test_init(4, client_fn, NULL); + test_init(5, client_fn, NULL); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c index 885866cc193c..8901a6785dc8 100644 --- a/tools/testing/selftests/net/tcp_ao/seq-ext.c +++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c @@ -116,6 +116,14 @@ static void *server_fn(void *arg) sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, client_new_port, &ao1); + trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_addr, + this_ip_dest, test_server_port + 1, client_new_port, 1); + trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_dest, + this_ip_addr, client_new_port, test_server_port + 1, 1); + trace_ao_event_sne_expect(TCP_AO_RCV_SNE_UPDATE, this_ip_addr, + this_ip_dest, test_server_port + 1, client_new_port, 1); + trace_ao_event_sne_expect(TCP_AO_RCV_SNE_UPDATE, this_ip_dest, + this_ip_addr, client_new_port, test_server_port + 1, 1); synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ bytes = test_server_run(sk, quota, TEST_TIMEOUT_SEC); if (bytes != quota) { @@ -242,6 +250,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(7, server_fn, client_fn); + test_init(8, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 5eee826c37aa..084db4ecdff6 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -830,6 +830,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(120, client_fn, NULL); + test_init(121, client_fn, NULL); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index 02346b58efbd..f779e5892bc1 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -674,24 +674,38 @@ static void *client_fn(void *arg) try_connect("AO server (INADDR_ANY): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 0, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (INADDR_ANY): MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (INADDR_ANY): unsigned client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &this_ip_addr); try_connect("AO server (AO_REQUIRED): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 0, &this_ip_addr); + 
trace_hash_event_expect(TCP_HASH_AO_REQUIRED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (AO_REQUIRED): unsigned client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &client2); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("MD5 server (INADDR_ANY): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); try_connect("MD5 server (INADDR_ANY): MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, 0, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("MD5 server (INADDR_ANY): no sign client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("no sign server: AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("no sign server: MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); try_connect("no sign server: no sign client", port++, NULL, 0, @@ -699,25 +713,37 @@ static void *client_fn(void *arg) try_connect("AO+MD5 server: AO client (matching)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 1, &client2); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("AO+MD5 server: AO client (misconfig, matching MD5)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, client3, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("AO+MD5 server: AO client (misconfig, non-matching)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client3); try_connect("AO+MD5 server: MD5 client (matching)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, 0, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: MD5 client (misconfig, matching AO)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client2); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, client3, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: MD5 client (misconfig, non-matching)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client3); try_connect("AO+MD5 server: no sign client (unmatched)", port++, NULL, 0, NULL, 0, 100, 100, 0, 0, 1, &client3); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: no sign client (misconfig, matching AO)", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client2); + trace_hash_event_expect(TCP_HASH_MD5_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: no sign client (misconfig, matching MD5)", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); @@ -739,6 +765,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(72, server_fn, client_fn); + test_init(73, server_fn, client_fn); return 0; } -- cgit v1.2.3 From c264487e5410e5a72db8a414566ab7d144223e6c Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Tue, 20 Aug 2024 
10:36:22 +0800 Subject: selftests/bpf: Fix incorrect parameters in NULL pointer checking Smatch reported the following warning: ./tools/testing/selftests/bpf/testing_helpers.c:455 get_xlated_program() warn: variable dereferenced before check 'buf' (see line 454) It seems correct, so let's modify it based on its suggestion. Actually, commit b23ed4d74c4d ("selftests/bpf: Fix invalid pointer check in get_xlated_program()") fixed this issue in test_verifier.c once, but the fix was reverted this time. Let's solve this issue with the minimal changes possible. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/1eb3732f-605a-479d-ba64-cd14250cbf91@stanley.mountain/ Fixes: b4b7a4099b8c ("selftests/bpf: Factor out get_xlated_program() helper") Signed-off-by: Hao Ge Link: https://lore.kernel.org/r/20240820023622.29190-1-hao.ge@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/testing_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index c217e12bd9da..d3c3c3a24150 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -452,7 +452,7 @@ int get_xlated_program(int fd_prog, struct bpf_insn **buf, __u32 *cnt) *cnt = xlated_prog_len / buf_element_size; *buf = calloc(*cnt, buf_element_size); - if (!buf) { + if (!*buf) { perror("can't allocate xlated program buffer"); return -ENOMEM; } -- cgit v1.2.3 From 4f3affe0abf5d5910dc469a1f63257629605d3c3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 20 Aug 2024 11:32:02 -0700 Subject: perf hist: Don't set hpp_fmt_value for members in --no-group Perf crashes as below when applying --no-group # perf record -e "{cache-misses,branches}" -b sleep 1 # perf report --stdio --no-group free(): invalid next size (fast) Aborted (core dumped) # In __hpp__fmt(), only one hpp_fmt_value is allocated for the current event when --no-group is applied. However, the current implementation tries to assign the hists from all group members to the hpp_fmt_value array, which exceeds the allocated memory.
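A minimal sketch of the out-of-bounds write (simplified, with hypothetical locals; the real logic lives in __hpp__fmt() in tools/perf/ui/hist.c):

	int nr_members = 1;	/* --no-group: room for a single value */
	struct hpp_fmt_value *values = calloc(nr_members, sizeof(*values));
	int i = 0;

	/* Buggy pattern: for_each_group_evsel() walks the whole recorded
	 * group regardless of nr_members, so with --no-group this writes
	 * values[1], values[2], ... past the single allocated element.
	 */
	for_each_group_evsel(pos, evsel)
		values[i++].hists = evsel__hists(pos);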
Fixes: 8f6071a3dce40e69 ("perf hist: Simplify __hpp_fmt() using hpp_fmt_data") Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Ian Rogers Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240820183202.3174323-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/hist.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 5d1f04f66a5a..e5491995adf0 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -62,7 +62,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, struct evsel *pos; char *buf = hpp->buf; size_t size = hpp->size; - int i, nr_members = 1; + int i = 0, nr_members = 1; struct hpp_fmt_value *values; if (evsel__is_group_event(evsel)) @@ -72,16 +72,16 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, if (values == NULL) return 0; - i = 0; - for_each_group_evsel(pos, evsel) - values[i++].hists = evsel__hists(pos); - + values[0].hists = evsel__hists(evsel); values[0].val = get_field(he); values[0].samples = he->stat.nr_events; if (evsel__is_group_event(evsel)) { struct hist_entry *pair; + for_each_group_member(pos, evsel) + values[++i].hists = evsel__hists(pos); + list_for_each_entry(pair, &he->pairs.head, pairs.node) { for (i = 0; i < nr_members; i++) { if (values[i].hists != pair->hists) -- cgit v1.2.3 From 0fe2b18ddc40bf8fbcd96dab651822dbadc1cd85 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 08:45:02 -0700 Subject: perf bpf-filter: Support multiple events properly So far it used tgid as a key to get the filter expressions in the pinned filters map for regular users, but that won't work well if the user has more than one filter at the same time. Let's add the event id to the key of the filter hash map so that it can identify the right filter expression in the BPF program. As the event can be inherited by child tasks, it should use the primary id which belongs to the parent (original) event. Since an evsel opens the event for multiple CPUs and tasks, it needs to maintain a separate hash map for the event id. In user space, it keeps a list for the multiple evsels and releases the entries in both hash maps when it closes the event. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240820154504.128923-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 288 ++++++++++++++++++++++++--- tools/perf/util/bpf_skel/sample-filter.h | 11 +- tools/perf/util/bpf_skel/sample_filter.bpf.c | 42 +++- tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 5 + 4 files changed, 304 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index c5eb0b7eec19..0a1832564dd2 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -1,4 +1,45 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/** + * Generic event filter for sampling events in BPF. + * + * The BPF program is fixed; it just reads filter expressions from the 'filters' + * map and compares the sample data in order to reject samples that don't match. + * Each filter expression contains a sample flag (term) to compare, an operation + * (==, >=, and so on) and a value.
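+ * For example, a filter like 'period >= 100' would be stored as a single + * entry whose term selects the sample period, with a >= op and the value 100 + * (see bpf_skel/sample-filter.h for the exact encoding of terms and ops).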
+ * + * Note that each entry has an array of filter expressions and it only succeeds + * when all of the expressions are satisfied. Logical OR is supported via a + * GROUP operation, which is satisfied when any of its member expressions + * evaluates to true. Nested GROUP operations are not allowed for now. + * + * To support non-root users, the filters map can be loaded and pinned in the BPF + * filesystem by root (perf record --setup-filter pin). Then each user will get + * a new entry in the shared filters map to fill the filter expressions, and the + * BPF program will find the filter using (task-id, event-id) as a key. + * + * The pinned BPF object (shared for regular users) has: + * + * event_hash | + * | | | + * event->id ---> | id | ---+ idx_hash | filters + * | | | | | | | | + * | .... | +-> | idx | --+--> | exprs | ---> perf_bpf_filter_entry[] + * | | | | | | .op + * | .... | | | ... | .term (+ part) task id (tgid) --------------+ | .value + * | + * ======= (root would skip this part) ======== (compares it in a loop) + * + * This is used for per-task use cases, while system-wide profiling (normally from + * the root user) uses a separate copy of the program and the maps for its own use, + * so that it can proceed even if a lot of non-root users are using the filters at + * the same time. In this case the filters map has a single entry and there is no + * need to use the hash maps to get the index (key) of the filters map (IOW it's + * always 0). + * + * The BPF program returns 1 to accept the sample or 0 to drop it. + * The 'dropped' map keeps track of how many samples were dropped by the filter; + * that count is reported as lost samples. + */ #include #include #include @@ -6,6 +47,7 @@ #include #include +#include #include #include #include @@ -27,7 +69,14 @@ #define PERF_SAMPLE_TYPE(_st, opt) __PERF_SAMPLE_TYPE(PBF_TERM_##_st, PERF_SAMPLE_##_st, opt) /* Index in the pinned 'filters' map. Should be released after use. */ -static int pinned_filter_idx = -1; +struct pinned_filter_idx { + struct list_head list; + struct evsel *evsel; + u64 event_id; + int hash_idx; +}; + +static LIST_HEAD(pinned_filters); static const struct perf_sample_info { enum perf_bpf_filter_term type; @@ -175,24 +224,145 @@ static int convert_to_tgid(int tid) return tgid; } -static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry) +/* + * The event might be closed already so we cannot get the list of ids using the + * FD as in create_event_hash() below; let's iterate the event_hash map and + * delete all entries that have the event id as the value. + */ +static void destroy_event_hash(u64 event_id) +{ + int fd; + u64 key, *prev_key = NULL; + int num = 0, alloced = 32; + u64 *ids = calloc(alloced, sizeof(*ids)); + + if (ids == NULL) + return; + + fd = get_pinned_fd("event_hash"); + if (fd < 0) { + pr_debug("cannot get fd for 'event_hash' map\n"); + free(ids); + return; + } + + /* Iterate the whole map to collect keys for the event id. */ + while (!bpf_map_get_next_key(fd, prev_key, &key)) { + u64 id; + + if (bpf_map_lookup_elem(fd, &key, &id) == 0 && id == event_id) { + if (num == alloced) { + void *tmp; + + alloced *= 2; + tmp = realloc(ids, alloced * sizeof(*ids)); + if (tmp == NULL) + break; + + ids = tmp; + } + ids[num++] = key; + } + + prev_key = &key; + } + + for (int i = 0; i < num; i++) + bpf_map_delete_elem(fd, &ids[i]); + + free(ids); + close(fd); +} + +/* + * Return a representative id if ok, or 0 for failures.
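+ * (Zero is a safe failure sentinel here: the kernel hands out perf event ids + * from an incrementing counter starting at 1, so a valid id is never 0.)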
+ * + * The perf_event->id is good for this, but an evsel would have multiple + * instances for CPUs and tasks. So pick up the first id and setup a hash + * from id of each instance to the representative id (the first one). + */ +static u64 create_event_hash(struct evsel *evsel) +{ + int x, y, fd; + u64 the_id = 0, id; + + fd = get_pinned_fd("event_hash"); + if (fd < 0) { + pr_err("cannot get fd for 'event_hash' map\n"); + return 0; + } + + for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) { + for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) { + int ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_ID, &id); + + if (ret < 0) { + pr_err("Failed to get the event id\n"); + if (the_id) + destroy_event_hash(the_id); + return 0; + } + + if (the_id == 0) + the_id = id; + + bpf_map_update_elem(fd, &id, &the_id, BPF_ANY); + } + } + + close(fd); + return the_id; +} + +static void destroy_idx_hash(struct pinned_filter_idx *pfi) +{ + int fd, nr; + struct perf_thread_map *threads; + + fd = get_pinned_fd("filters"); + bpf_map_delete_elem(fd, &pfi->hash_idx); + close(fd); + + if (pfi->event_id) + destroy_event_hash(pfi->event_id); + + threads = perf_evsel__threads(&pfi->evsel->core); + if (threads == NULL) + return; + + fd = get_pinned_fd("idx_hash"); + nr = perf_thread_map__nr(threads); + for (int i = 0; i < nr; i++) { + /* The target task might be dead already, just try the pid */ + struct idx_hash_key key = { + .evt_id = pfi->event_id, + .tgid = perf_thread_map__pid(threads, i), + }; + + bpf_map_delete_elem(fd, &key); + } + close(fd); +} + +/* Maintain a hashmap from (tgid, event-id) to filter index */ +static int create_idx_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry) { int filter_idx; int fd, nr, last; + u64 event_id = 0; + struct pinned_filter_idx *pfi = NULL; struct perf_thread_map *threads; fd = get_pinned_fd("filters"); if (fd < 0) { - pr_debug("cannot get fd for 'filters' map\n"); + pr_err("cannot get fd for 'filters' map\n"); return fd; } /* Find the first available entry in the filters map */ for (filter_idx = 0; filter_idx < MAX_FILTERS; filter_idx++) { - if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) { - pinned_filter_idx = filter_idx; + if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) break; - } } close(fd); @@ -201,22 +371,44 @@ static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *en return -EBUSY; } + pfi = zalloc(sizeof(*pfi)); + if (pfi == NULL) { + pr_err("Cannot save pinned filter index\n"); + goto err; + } + + pfi->evsel = evsel; + pfi->hash_idx = filter_idx; + + event_id = create_event_hash(evsel); + if (event_id == 0) { + pr_err("Cannot update the event hash\n"); + goto err; + } + + pfi->event_id = event_id; + threads = perf_evsel__threads(&evsel->core); if (threads == NULL) { pr_err("Cannot get the thread list of the event\n"); - return -EINVAL; + goto err; } /* save the index to a hash map */ - fd = get_pinned_fd("pid_hash"); - if (fd < 0) - return fd; + fd = get_pinned_fd("idx_hash"); + if (fd < 0) { + pr_err("cannot get fd for 'idx_hash' map\n"); + goto err; + } last = -1; nr = perf_thread_map__nr(threads); for (int i = 0; i < nr; i++) { int pid = perf_thread_map__pid(threads, i); int tgid; + struct idx_hash_key key = { + .evt_id = event_id, + }; /* it actually needs tgid, let's get tgid from /proc. 
*/ tgid = convert_to_tgid(pid); @@ -228,16 +420,25 @@ static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *en if (tgid == last) continue; last = tgid; + key.tgid = tgid; - if (bpf_map_update_elem(fd, &tgid, &filter_idx, BPF_ANY) < 0) { - pr_err("Failed to update the pid hash\n"); + if (bpf_map_update_elem(fd, &key, &filter_idx, BPF_ANY) < 0) { + pr_err("Failed to update the idx_hash\n"); close(fd); - return -1; + goto err; } - pr_debug("pid hash: %d -> %d\n", tgid, filter_idx); + pr_debug("bpf-filter: idx_hash (task=%d,%s) -> %d\n", + tgid, evsel__name(evsel), filter_idx); } + + list_add(&pfi->list, &pinned_filters); close(fd); - return 0; + return filter_idx; + +err: + destroy_idx_hash(pfi); + free(pfi); + return -1; } int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) @@ -247,7 +448,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) struct bpf_program *prog; struct bpf_link *link; struct perf_bpf_filter_entry *entry; - bool needs_pid_hash = !target__has_cpu(target) && !target->uid_str; + bool needs_idx_hash = !target__has_cpu(target) && !target->uid_str; entry = calloc(MAX_FILTERS, sizeof(*entry)); if (entry == NULL) @@ -259,11 +460,11 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) goto err; } - if (needs_pid_hash && geteuid() != 0) { + if (needs_idx_hash && geteuid() != 0) { int zero = 0; /* The filters map is shared among other processes */ - ret = update_pid_hash(evsel, entry); + ret = create_idx_hash(evsel, entry); if (ret < 0) goto err; @@ -274,7 +475,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) } /* Reset the lost count */ - bpf_map_update_elem(fd, &pinned_filter_idx, &zero, BPF_ANY); + bpf_map_update_elem(fd, &ret, &zero, BPF_ANY); close(fd); fd = get_pinned_fd("perf_sample_filter"); @@ -288,6 +489,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_SET_BPF, fd); if (ret < 0) { pr_err("Failed to attach perf sample-filter\n"); + close(fd); goto err; } } @@ -332,6 +534,15 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) err: free(entry); + if (!list_empty(&pinned_filters)) { + struct pinned_filter_idx *pfi, *tmp; + + list_for_each_entry_safe(pfi, tmp, &pinned_filters, list) { + destroy_idx_hash(pfi); + list_del(&pfi->list); + free(pfi); + } + } sample_filter_bpf__destroy(skel); return ret; } @@ -339,6 +550,7 @@ err: int perf_bpf_filter__destroy(struct evsel *evsel) { struct perf_bpf_filter_expr *expr, *tmp; + struct pinned_filter_idx *pfi, *pos; list_for_each_entry_safe(expr, tmp, &evsel->bpf_filters, list) { list_del(&expr->list); @@ -346,14 +558,11 @@ int perf_bpf_filter__destroy(struct evsel *evsel) } sample_filter_bpf__destroy(evsel->bpf_skel); - if (pinned_filter_idx >= 0) { - int fd = get_pinned_fd("filters"); - - bpf_map_delete_elem(fd, &pinned_filter_idx); - pinned_filter_idx = -1; - close(fd); + list_for_each_entry_safe(pfi, pos, &pinned_filters, list) { + destroy_idx_hash(pfi); + list_del(&pfi->list); + free(pfi); } - return 0; } @@ -364,10 +573,20 @@ u64 perf_bpf_filter__lost_count(struct evsel *evsel) if (list_empty(&evsel->bpf_filters)) return 0; - if (pinned_filter_idx >= 0) { + if (!list_empty(&pinned_filters)) { int fd = get_pinned_fd("dropped"); + struct pinned_filter_idx *pfi; + + if (fd < 0) + return 0; - bpf_map_lookup_elem(fd, &pinned_filter_idx, &count); + list_for_each_entry(pfi, &pinned_filters, list) { + if (pfi->evsel != evsel) 
+ continue; + + bpf_map_lookup_elem(fd, &pfi->hash_idx, &count); + break; + } close(fd); } else if (evsel->bpf_skel) { struct sample_filter_bpf *skel = evsel->bpf_skel; @@ -429,9 +648,10 @@ int perf_bpf_filter__pin(void) /* pinned program will use pid-hash */ bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); - bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); + bpf_map__set_max_entries(skel->maps.event_hash, MAX_EVT_HASH); + bpf_map__set_max_entries(skel->maps.idx_hash, MAX_IDX_HASH); bpf_map__set_max_entries(skel->maps.dropped, MAX_FILTERS); - skel->rodata->use_pid_hash = 1; + skel->rodata->use_idx_hash = 1; if (sample_filter_bpf__load(skel) < 0) { ret = -errno; @@ -484,8 +704,12 @@ int perf_bpf_filter__pin(void) pr_debug("chmod for filters failed\n"); ret = -errno; } - if (fchmodat(dir_fd, "pid_hash", 0666, 0) < 0) { - pr_debug("chmod for pid_hash failed\n"); + if (fchmodat(dir_fd, "event_hash", 0666, 0) < 0) { + pr_debug("chmod for event_hash failed\n"); + ret = -errno; + } + if (fchmodat(dir_fd, "idx_hash", 0666, 0) < 0) { + pr_debug("chmod for idx_hash failed\n"); ret = -errno; } if (fchmodat(dir_fd, "dropped", 0666, 0) < 0) { diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h index e666bfd5fbdd..5f0c8e4e83d3 100644 --- a/tools/perf/util/bpf_skel/sample-filter.h +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -1,8 +1,9 @@ #ifndef PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H #define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H -#define MAX_FILTERS 64 -#define MAX_PIDS (16 * 1024) +#define MAX_FILTERS 64 +#define MAX_IDX_HASH (16 * 1024) +#define MAX_EVT_HASH (1024 * 1024) /* supported filter operations */ enum perf_bpf_filter_op { @@ -62,4 +63,10 @@ struct perf_bpf_filter_entry { __u64 value; }; +struct idx_hash_key { + __u64 evt_id; + __u32 tgid; + __u32 reserved; +}; + #endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */ diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index 4c75354b84fd..4872a16eedfd 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -15,13 +15,25 @@ struct filters { __uint(max_entries, 1); } filters SEC(".maps"); -/* tgid to filter index */ -struct pid_hash { +/* + * An evsel has multiple instances for each CPU or task but we need a single + * id to be used as a key for the idx_hash. This hashmap would translate the + * instance's ID to a representative ID. 
+ */ +struct event_hash { __uint(type, BPF_MAP_TYPE_HASH); - __type(key, int); + __type(key, __u64); + __type(value, __u64); + __uint(max_entries, 1); +} event_hash SEC(".maps"); + +/* tgid/evtid to filter index */ +struct idx_hash { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct idx_hash_key); __type(value, int); __uint(max_entries, 1); -} pid_hash SEC(".maps"); +} idx_hash SEC(".maps"); /* tgid to filter index */ struct lost_count { @@ -31,7 +43,7 @@ struct lost_count { __uint(max_entries, 1); } dropped SEC(".maps"); -volatile const int use_pid_hash; +volatile const int use_idx_hash; void *bpf_cast_to_kern_ctx(void *) __ksym; @@ -202,11 +214,25 @@ int perf_sample_filter(void *ctx) k = 0; - if (use_pid_hash) { - int tgid = bpf_get_current_pid_tgid() >> 32; + if (use_idx_hash) { + struct idx_hash_key key = { + .tgid = bpf_get_current_pid_tgid() >> 32, + }; + __u64 eid = kctx->event->id; + __u64 *key_id; int *idx; - idx = bpf_map_lookup_elem(&pid_hash, &tgid); + /* get primary_event_id */ + if (kctx->event->parent) + eid = kctx->event->parent->id; + + key_id = bpf_map_lookup_elem(&event_hash, &eid); + if (key_id == NULL) + goto drop; + + key.evt_id = *key_id; + + idx = bpf_map_lookup_elem(&idx_hash, &key); if (idx) k = *idx; else diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index d818e30c5457..4fa21468487e 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -175,6 +175,11 @@ struct perf_sample_data { u64 code_page_size; } __attribute__((__aligned__(64))) __attribute__((preserve_access_index)); +struct perf_event { + struct perf_event *parent; + u64 id; +} __attribute__((preserve_access_index)); + struct bpf_perf_event_data_kern { struct perf_sample_data *data; struct perf_event *event; -- cgit v1.2.3 From 1a5474a779798f6ba9d798469326ebfa6b4a8fae Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 08:45:03 -0700 Subject: perf tools: Print lost samples due to BPF filter Print the actual dropped sample count in the event stat. 
$ sudo perf record -o- -e cycles --filter 'period < 10000' \ -e instructions --filter 'ip > 0x8000000000000000' perf test -w noploop | \ perf report --stat -i- [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.058 MB - ] Aggregated stats: TOTAL events: 469 MMAP events: 268 (57.1%) COMM events: 2 ( 0.4%) EXIT events: 1 ( 0.2%) SAMPLE events: 16 ( 3.4%) MMAP2 events: 22 ( 4.7%) LOST_SAMPLES events: 2 ( 0.4%) KSYMBOL events: 89 (19.0%) BPF_EVENT events: 39 ( 8.3%) ATTR events: 2 ( 0.4%) FINISHED_ROUND events: 1 ( 0.2%) ID_INDEX events: 1 ( 0.2%) THREAD_MAP events: 1 ( 0.2%) CPU_MAP events: 1 ( 0.2%) EVENT_UPDATE events: 2 ( 0.4%) TIME_CONV events: 1 ( 0.2%) FEATURE events: 20 ( 4.3%) FINISHED_INIT events: 1 ( 0.2%) cycles stats: SAMPLE events: 2 LOST_SAMPLES (BPF) events: 4010 instructions stats: SAMPLE events: 14 LOST_SAMPLES (BPF) events: 3990 Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240820154504.128923-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 9 +++++++-- tools/perf/ui/stdio/hist.c | 4 ++-- tools/perf/util/events_stats.h | 15 ++++++++++++++- tools/perf/util/hist.c | 19 +++++++++++++++---- tools/perf/util/hist.h | 1 + tools/perf/util/machine.c | 5 +++-- tools/perf/util/session.c | 5 +++-- 7 files changed, 45 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1643113616f4..c304d9b06190 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -795,8 +795,13 @@ static int count_lost_samples_event(const struct perf_tool *tool, evsel = evlist__id2evsel(rep->session->evlist, sample->id); if (evsel) { - hists__inc_nr_lost_samples(evsel__hists(evsel), - event->lost_samples.lost); + struct hists *hists = evsel__hists(evsel); + u32 count = event->lost_samples.lost; + + if (event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF) + hists__inc_nr_dropped_samples(hists, count); + else + hists__inc_nr_lost_samples(hists, count); } return 0; } diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 9372e8904d22..74b2c619c56c 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -913,11 +913,11 @@ size_t events_stats__fprintf(struct events_stats *stats, FILE *fp) continue; if (i && total) { - ret += fprintf(fp, "%16s events: %10d (%4.1f%%)\n", + ret += fprintf(fp, "%20s events: %10d (%4.1f%%)\n", name, stats->nr_events[i], 100.0 * stats->nr_events[i] / total); } else { - ret += fprintf(fp, "%16s events: %10d\n", + ret += fprintf(fp, "%20s events: %10d\n", name, stats->nr_events[i]); } } diff --git a/tools/perf/util/events_stats.h b/tools/perf/util/events_stats.h index f43e5b1a366a..eabd7913c309 100644 --- a/tools/perf/util/events_stats.h +++ b/tools/perf/util/events_stats.h @@ -18,7 +18,18 @@ * PERF_RECORD_LOST_SAMPLES event. The number of lost-samples events is stored * in .nr_events[PERF_RECORD_LOST_SAMPLES] while total_lost_samples tells * exactly how many samples the kernel in fact dropped, i.e. it is the sum of - * all struct perf_record_lost_samples.lost fields reported. + * all struct perf_record_lost_samples.lost fields reported without setting the + * misc field in the header. + * + * The BPF program can discard samples according to the filter expressions given + * by the user. 
This number is kept in a BPF map and dumped at the end of perf + * record in a PERF_RECORD_LOST_SAMPLES event. To differentiate it from other + * lost samples, perf tools sets PERF_RECORD_MISC_LOST_SAMPLES_BPF flag in the + * header.misc field. The number of dropped-samples events is stored in + * .nr_events[PERF_RECORD_LOST_SAMPLES] while total_dropped_samples tells + * exactly how many samples the BPF program in fact dropped, i.e. it is the sum + * of all struct perf_record_lost_samples.lost fields reported with the misc + * field set in the header. * * The total_period is needed because by default auto-freq is used, so * multiplying nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get @@ -28,6 +39,7 @@ struct events_stats { u64 total_lost; u64 total_lost_samples; + u64 total_dropped_samples; u64 total_aux_lost; u64 total_aux_partial; u64 total_aux_collision; @@ -48,6 +60,7 @@ struct hists_stats { u32 nr_samples; u32 nr_non_filtered_samples; u32 nr_lost_samples; + u32 nr_dropped_samples; }; void events_stats__inc(struct events_stats *stats, u32 type); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 0f9ce2ee2c31..dadce2889e52 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2385,6 +2385,11 @@ void hists__inc_nr_lost_samples(struct hists *hists, u32 lost) hists->stats.nr_lost_samples += lost; } +void hists__inc_nr_dropped_samples(struct hists *hists, u32 lost) +{ + hists->stats.nr_dropped_samples += lost; +} + static struct hist_entry *hists__add_dummy_entry(struct hists *hists, struct hist_entry *pair) { @@ -2729,18 +2734,24 @@ size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp) evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); + u64 total_samples = hists->stats.nr_samples; + + total_samples += hists->stats.nr_lost_samples; + total_samples += hists->stats.nr_dropped_samples; - if (symbol_conf.skip_empty && !hists->stats.nr_samples && - !hists->stats.nr_lost_samples) + if (symbol_conf.skip_empty && total_samples == 0) continue; ret += fprintf(fp, "%s stats:\n", evsel__name(pos)); if (hists->stats.nr_samples) - ret += fprintf(fp, "%16s events: %10d\n", + ret += fprintf(fp, "%20s events: %10d\n", "SAMPLE", hists->stats.nr_samples); if (hists->stats.nr_lost_samples) - ret += fprintf(fp, "%16s events: %10d\n", + ret += fprintf(fp, "%20s events: %10d\n", "LOST_SAMPLES", hists->stats.nr_lost_samples); + if (hists->stats.nr_dropped_samples) + ret += fprintf(fp, "%20s events: %10d\n", + "LOST_SAMPLES (BPF)", hists->stats.nr_dropped_samples); } return ret; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index deb1087c5948..7d7ae94b4b31 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -372,6 +372,7 @@ void hists__inc_stats(struct hists *hists, struct hist_entry *h); void hists__inc_nr_events(struct hists *hists); void hists__inc_nr_samples(struct hists *hists, bool filtered); void hists__inc_nr_lost_samples(struct hists *hists, u32 lost); +void hists__inc_nr_dropped_samples(struct hists *hists, u32 lost); size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, int max_cols, float min_pcnt, FILE *fp, diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index cd79a830abae..5783b96fb988 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -642,8 +642,9 @@ int machine__process_lost_event(struct machine *machine __maybe_unused, int machine__process_lost_samples_event(struct machine *machine __maybe_unused, union perf_event 
*event, struct perf_sample *sample) { - dump_printf(": id:%" PRIu64 ": lost samples :%" PRI_lu64 "\n", - sample->id, event->lost_samples.lost); + dump_printf(": id:%" PRIu64 ": lost samples :%" PRI_lu64 "%s\n", + sample->id, event->lost_samples.lost, + event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF ? " (BPF)" : ""); return 0; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index d2bd563119bc..774eb3382000 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1290,8 +1290,9 @@ static int machines__deliver_event(struct machines *machines, evlist->stats.total_lost += event->lost.lost; return tool->lost(tool, event, sample, machine); case PERF_RECORD_LOST_SAMPLES: - if (tool->lost_samples == perf_event__process_lost_samples && - !(event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF)) + if (event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF) + evlist->stats.total_dropped_samples += event->lost_samples.lost; + else if (tool->lost_samples == perf_event__process_lost_samples) evlist->stats.total_lost_samples += event->lost_samples.lost; return tool->lost_samples(tool, event, sample, machine); case PERF_RECORD_READ: -- cgit v1.2.3 From 150ca9ccc4e94eaa07b9d3e07f45fec842b68a78 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 08:45:04 -0700 Subject: perf test: Update sample filtering tests with multiple events Add a 'Multiple bpf-filter' test for two or more events with filters. It uses task-clock and page-faults events with different filter expressions and checks the perf script output. $ sudo ./perf test filtering -vv 96: perf record sample filtering (by BPF) tests: --- start --- test child forked, pid 2804025 Checking BPF-filter privilege Basic bpf-filter test Basic bpf-filter test [Success] Failing bpf-filter test Error: task-clock event does not have PERF_SAMPLE_CPU Failing bpf-filter test [Success] Group bpf-filter test Error: task-clock event does not have PERF_SAMPLE_CPU Error: task-clock event does not have PERF_SAMPLE_CODE_PAGE_SIZE Group bpf-filter test [Success] Multiple bpf-filter test Multiple bpf-filter test [Success] ---- end(0) ---- 96: perf record sample filtering (by BPF) tests : Ok Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240820154504.128923-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_bpf_filter.sh | 34 +++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/record_bpf_filter.sh b/tools/perf/tests/shell/record_bpf_filter.sh index c5882d620db7..4df33c15bfa7 100755 --- a/tools/perf/tests/shell/record_bpf_filter.sh +++ b/tools/perf/tests/shell/record_bpf_filter.sh @@ -116,6 +116,36 @@ test_bpf_filter_group() { echo "Group bpf-filter test [Success]" } +test_bpf_filter_multi() { + echo "Multiple bpf-filter test" + + if ! perf record -e task-clock --filter 'period > 100000' \ + -e page-faults --filter 'ip < 0xffffffff00000000' \ + -o "${perfdata}" true 2> /dev/null + then + echo "Multiple bpf-filter test [Failed record]" + err=1 + return + fi + + if !
perf script -i "${perfdata}" -F period,event | grep task-clock | \ + awk '{ if (int($1) <= 100000) { print $0; exit(1); } }' + then + echo "Multiple bpf-filter test [Failed task-clock period]" + err=1 + return + fi + + if perf script -i "${perfdata}" -F event,ip | grep page-fault | \ + grep 'ffffffff[0-9a-f]*' + then + echo "Multiple bpf-filter test [Failed page-faults ip]" + err=1 + return + fi + + echo "Multiple bpf-filter test [Success]" +} test_bpf_filter_priv @@ -131,5 +161,9 @@ if [ $err = 0 ]; then test_bpf_filter_group fi +if [ $err = 0 ]; then + test_bpf_filter_multi +fi + cleanup exit $err -- cgit v1.2.3 From a68080e1a21b3183ee804ae05726f6103e322348 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Aug 2024 15:57:33 -0300 Subject: perf test vfs_getname: Look for alternative line where to collect the pathname The getname_flags() routine changed recently and thus the place where we were getting the pathname is not probeable anymore, albeit still present, so use the next line for that. Before: root@number:/home/acme/git/perf-tools-next# perf test vfs_getname 91: Add vfs_getname probe to get syscall args filenames : FAILED! 93: Use vfs_getname probe to get syscall args filenames : FAILED! 126: Check open filename arg using perf trace + vfs_getname : FAILED! root@number:/home/acme/git/perf-tools-next# Now tests 91 and 126 are passing; some more investigation is needed for test 93, which continues to fail. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/probe_vfs_getname.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/lib/probe_vfs_getname.sh b/tools/perf/tests/shell/lib/probe_vfs_getname.sh index bf4c1fb71c4b..b3802d9494b4 100644 --- a/tools/perf/tests/shell/lib/probe_vfs_getname.sh +++ b/tools/perf/tests/shell/lib/probe_vfs_getname.sh @@ -13,7 +13,12 @@ cleanup_probe_vfs_getname() { add_probe_vfs_getname() { add_probe_verbose=$1 if [ $had_vfs_getname -eq 1 ] ; then - line=$(perf probe -L getname_flags 2>&1 | grep -E 'result.*=.*filename;' | sed -r 's/[[:space:]]+([[:digit:]]+)[[:space:]]+result->uptr.*/\1/') + result_filename_re="[[:space:]]+([[:digit:]]+)[[:space:]]+result->uptr.*" + line=$(perf probe -L getname_flags 2>&1 | grep -E "$result_filename_re=" | sed -r "s/$result_filename_re=/\1/") + if [ -z "$line" ] ; then + result_aname_re="[[:space:]]+([[:digit:]]+)[[:space:]]+result->aname = NULL;" + line=$(perf probe -L getname_flags 2>&1 | grep -E "$result_aname_re=" | sed -r "s/$result_aname_re=/\1/") + fi perf probe -q "vfs_getname=getname_flags:${line} pathname=result->name:string" || \ perf probe $add_probe_verbose "vfs_getname=getname_flags:${line} pathname=filename:ustring" fi -- cgit v1.2.3 From 5a02447c8145b87373ada2040bbab5bc2a8f87be Mon Sep 17 00:00:00 2001 From: Michael Petlan Date: Tue, 2 Jul 2024 13:08:40 +0200 Subject: perf tests shell: Skip base_* dirs in test script search The test scripts in base_* directories currently have their own drivers that run them. Without this patch, the shell test-suite generator causes them to run twice. Fix that by skipping them in the generator. A cleaner solution (for the future) will be to use the directory structure idea (introduced by Carsten Haitzler in 7391db645938 ("perf test: Refactor shell tests allowing subdirs")) to generate test entries with subtests, like: $ perf test list [...]
97: perf probe shell tests 97:1: perf probe basic functionality 97:2: perf probe tests with arguments 97:3: perf probe invalid options handling [...] There is already a lot of shell test scripts and many are about to come, so there is a need for some hierarchy. Signed-off-by: Michael Petlan Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-3-vmolnaro@redhat.com Signed-off-by: Veronika Molnarova Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/tests-scripts.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/tests-scripts.c b/tools/perf/tests/tests-scripts.c index 8dc1e398288c..ed114b044293 100644 --- a/tools/perf/tests/tests-scripts.c +++ b/tools/perf/tests/tests-scripts.c @@ -251,6 +251,8 @@ static void append_scripts_in_dir(int dir_fd, if (!S_ISDIR(st.st_mode)) continue; } + if (strncmp(ent->d_name, "base_", 5) == 0) + continue; /* Skip scripts that have a separate driver. */ fd = openat(dir_fd, ent->d_name, O_PATH); append_scripts_in_dir(fd, result, result_sz); } -- cgit v1.2.3 From a3a02a52bcfcbcc4a637d4b68bf1bc391c9fad02 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:41 +0200 Subject: perf testsuite: Merge settings files for shell tests Merge perf testsuite setting files into common settings to reduce duplicates and prevent errors. Signed-off-by: Michael Petlan Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-4-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/base_probe/settings.sh | 48 ---------------------- .../tests/shell/base_probe/test_adding_kernel.sh | 3 -- tools/perf/tests/shell/common/init.sh | 23 +++++++++++ tools/perf/tests/shell/common/settings.sh | 23 +++++++++++ 4 files changed, 46 insertions(+), 51 deletions(-) delete mode 100644 tools/perf/tests/shell/base_probe/settings.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/settings.sh b/tools/perf/tests/shell/base_probe/settings.sh deleted file mode 100644 index 123621c7f95e..000000000000 --- a/tools/perf/tests/shell/base_probe/settings.sh +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# settings.sh of perf_probe test -# Author: Michael Petlan -# Author: Masami Hiramatsu -# - -export TEST_NAME="perf_probe" - -export MY_ARCH=`arch` - -if [ -n "$PERFSUITE_RUN_DIR" ]; then - # when $PERFSUITE_RUN_DIR is set to something, all the logs and temp files will be placed there - # --> the $PERFSUITE_RUN_DIR/perf_something/examples and $PERFSUITE_RUN_DIR/perf_something/logs - # dirs will be used for that - export PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR` - export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME" - export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples" - test -d "$MAKE_TARGET_DIR" || mkdir -p "$MAKE_TARGET_DIR" - export LOGS_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME/logs" - test -d "$LOGS_DIR" || mkdir -p "$LOGS_DIR" -else - # when $PERFSUITE_RUN_DIR is not set, logs will be placed here - export CURRENT_TEST_DIR="." - export LOGS_DIR="." 
-fi - -check_kprobes_available() -{ - test -e /sys/kernel/debug/tracing/kprobe_events -} - -check_uprobes_available() -{ - test -e /sys/kernel/debug/tracing/uprobe_events -} - -clear_all_probes() -{ - echo 0 > /sys/kernel/debug/tracing/events/enable - check_kprobes_available && echo > /sys/kernel/debug/tracing/kprobe_events - check_uprobes_available && echo > /sys/kernel/debug/tracing/uprobe_events -} - -check_sdt_support() -{ - $CMD_PERF list sdt | grep sdt > /dev/null 2> /dev/null -} diff --git a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh index 187dc8d4b163..d541ffd44a93 100755 --- a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh +++ b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh @@ -15,10 +15,7 @@ # include working environment . ../common/init.sh -. ./settings.sh -# shellcheck disable=SC2034 # the variable is later used after the working environment is included -THIS_TEST_NAME=`basename $0 .sh` TEST_RESULT=0 # shellcheck source=lib/probe_vfs_getname.sh diff --git a/tools/perf/tests/shell/common/init.sh b/tools/perf/tests/shell/common/init.sh index aadeaf782e03..b99fba50ba94 100644 --- a/tools/perf/tests/shell/common/init.sh +++ b/tools/perf/tests/shell/common/init.sh @@ -115,3 +115,26 @@ detect_amd() # 1 = is not AMD or unknown grep "vendor_id" < /proc/cpuinfo | grep -q "AMD" } + +# base probe utility +check_kprobes_available() +{ + test -e /sys/kernel/debug/tracing/kprobe_events +} + +check_uprobes_available() +{ + test -e /sys/kernel/debug/tracing/uprobe_events +} + +clear_all_probes() +{ + echo 0 > /sys/kernel/debug/tracing/events/enable + check_kprobes_available && echo > /sys/kernel/debug/tracing/kprobe_events + check_uprobes_available && echo > /sys/kernel/debug/tracing/uprobe_events +} + +check_sdt_support() +{ + $CMD_PERF list sdt | grep sdt > /dev/null 2> /dev/null +} diff --git a/tools/perf/tests/shell/common/settings.sh b/tools/perf/tests/shell/common/settings.sh index 361641dbaaad..dcdb7d49ac00 100644 --- a/tools/perf/tests/shell/common/settings.sh +++ b/tools/perf/tests/shell/common/settings.sh @@ -65,6 +65,29 @@ else export MEND="" fi +### general info +DIR_PATH=`dirname "$(readlink -e "$0")"` + +export TEST_NAME=`basename $DIR_PATH | sed 's/base/perf/'` +export MY_ARCH=`arch` + +# storing logs and temporary files variables +if [ -n "$PERFSUITE_RUN_DIR" ]; then + # when $PERFSUITE_RUN_DIR is set to something, all the logs and temp files will be placed there + # --> the $PERFSUITE_RUN_DIR/perf_something/examples and $PERFSUITE_RUN_DIR/perf_something/logs + # dirs will be used for that + export PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR` + export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME" + export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples" + export LOGS_DIR="$CURRENT_TEST_DIR/logs" + test -d "$CURRENT_TEST_DIR" || mkdir -p "$CURRENT_TEST_DIR" + test -d "$LOGS_DIR" || mkdir -p "$LOGS_DIR" +else + # when $PERFSUITE_RUN_DIR is not set, logs will be placed here + export CURRENT_TEST_DIR="." + export LOGS_DIR="." +fi + #### test parametrization if [ ! -d ./common ]; then -- cgit v1.2.3 From 32ddd082dcac111908c5f11e0617cd0f8ec10350 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:42 +0200 Subject: perf testsuite: Fix shellcheck warnings Shellcheck is becoming a standard when building perf to prevent any unnecessary mistakes. Fix shellcheck warnings in perf testsuite. 
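For illustration, the two recurring patterns fixed here are the non-POSIX '-o' operator inside a single test command and "$@" used in a scalar assignment; a representative before/after pair, drawn from the hunks below:

  # before: '-o' inside one test command, "$@" assigned to a plain variable
  if [ -t 1 -o "$TESTLOG_FORCE_COLOR" = "yes" ]; then
  TASK_COMMENT="$@"

  # after: two test commands joined with ||, and "$*" for a plain string
  if [ -t 1 ] || [ "$TESTLOG_FORCE_COLOR" = "yes" ]; then
  TASK_COMMENT="$*"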
Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-5-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/common/init.sh | 8 ++++---- tools/perf/tests/shell/common/settings.sh | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/common/init.sh b/tools/perf/tests/shell/common/init.sh index b99fba50ba94..075f17623c8e 100644 --- a/tools/perf/tests/shell/common/init.sh +++ b/tools/perf/tests/shell/common/init.sh @@ -26,8 +26,8 @@ print_results() PERF_RETVAL="$1"; shift CHECK_RETVAL="$1"; shift FAILURE_REASON="" - TASK_COMMENT="$@" - if [ $PERF_RETVAL -eq 0 -a $CHECK_RETVAL -eq 0 ]; then + TASK_COMMENT="$*" + if [ $PERF_RETVAL -eq 0 ] && [ $CHECK_RETVAL -eq 0 ]; then _echo "$MPASS-- [ PASS ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $TASK_COMMENT" return 0 else @@ -56,7 +56,7 @@ print_overall_results() print_testcase_skipped() { - TASK_COMMENT="$@" + TASK_COMMENT="$*" _echo "$MSKIP-- [ SKIP ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $TASK_COMMENT :: testcase skipped" return 0 } @@ -69,7 +69,7 @@ print_overall_skipped() print_warning() { - WARN_COMMENT="$@" + WARN_COMMENT="$*" _echo "$MWARN-- [ WARN ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $WARN_COMMENT" return 0 } diff --git a/tools/perf/tests/shell/common/settings.sh b/tools/perf/tests/shell/common/settings.sh index dcdb7d49ac00..2e50dd0bff9b 100644 --- a/tools/perf/tests/shell/common/settings.sh +++ b/tools/perf/tests/shell/common/settings.sh @@ -45,7 +45,7 @@ export TEST_IGNORE_MISSING_PMU=${TEST_IGNORE_MISSING_PMU:-n} export LC_ALL=C #### colors -if [ -t 1 -o "$TESTLOG_FORCE_COLOR" = "yes" ]; then +if [ -t 1 ] || [ "$TESTLOG_FORCE_COLOR" = "yes" ]; then export MPASS="\e[32m" export MALLPASS="\e[1;32m" export MFAIL="\e[31m" @@ -68,15 +68,15 @@ fi ### general info DIR_PATH=`dirname "$(readlink -e "$0")"` -export TEST_NAME=`basename $DIR_PATH | sed 's/base/perf/'` -export MY_ARCH=`arch` +TEST_NAME=`basename $DIR_PATH | sed 's/base/perf/'`; export TEST_NAME +MY_ARCH=`arch`; export MY_ARCH # storing logs and temporary files variables if [ -n "$PERFSUITE_RUN_DIR" ]; then # when $PERFSUITE_RUN_DIR is set to something, all the logs and temp files will be placed there # --> the $PERFSUITE_RUN_DIR/perf_something/examples and $PERFSUITE_RUN_DIR/perf_something/logs # dirs will be used for that - export PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR` + PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR`; export PERFSUITE_RUN_DIR export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME" export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples" export LOGS_DIR="$CURRENT_TEST_DIR/logs" @@ -93,6 +93,7 @@ fi if [ ! -d ./common ]; then # set parameters based on runmode if [ -f ../common/parametrization.$PERFTOOL_TESTSUITE_RUNMODE.sh ]; then + # shellcheck source=/dev/null . ../common/parametrization.$PERFTOOL_TESTSUITE_RUNMODE.sh fi # if some parameters haven't been set until now, set them to default -- cgit v1.2.3 From def5480d63c1e84779d36bc2fe78382e3c5e1b12 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:43 +0200 Subject: perf testsuite probe: Add test for blacklisted kprobes handling Test perf probe interface. Blacklisted functions should be rejected when there is an attempt to set a kprobe to them. 
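As a sketch of the interaction being tested (the probed function is simply the first entry of the kernel's kprobe blacklist on the test machine, and the exact diagnostics vary by kernel version):

  # BLACKFUNC is read from the kernel's kprobe blacklist, as the test does
  BLACKFUNC=$(head -n 1 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2)
  perf probe --add "$BLACKFUNC"
  # expected to fail, typically with:
  #   <function> is blacklisted function, skip it.
  #   Error: Failed to add events.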
Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-6-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../shell/base_probe/test_adding_blacklisted.sh | 67 ++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100755 tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh b/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh new file mode 100755 index 000000000000..b5dc10b2a738 --- /dev/null +++ b/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_adding_blacklisted of perf_probe test +# Author: Masami Hiramatsu +# Author: Michael Petlan +# +# Description: +# +# Blacklisted functions should not be added successfully as probes, +# they must be skipped. +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + +# skip if not supported +BLACKFUNC=`head -n 1 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2` +if [ -z "$BLACKFUNC" ]; then + print_overall_skipped + exit 0 +fi + +# remove all previously added probes +clear_all_probes + + +### adding blacklisted function + +# functions from blacklist should be skipped by perf probe +! $CMD_PERF probe $BLACKFUNC > $LOGS_DIR/adding_blacklisted.log 2> $LOGS_DIR/adding_blacklisted.err +PERF_EXIT_CODE=$? + +REGEX_SCOPE_FAIL="Failed to find scope of probe point" +REGEX_SKIP_MESSAGE=" is blacklisted function, skip it\." +REGEX_NOT_FOUND_MESSAGE="Probe point \'$BLACKFUNC\' not found." +REGEX_ERROR_MESSAGE="Error: Failed to add events." +REGEX_INVALID_ARGUMENT="Failed to write event: Invalid argument" +REGEX_SYMBOL_FAIL="Failed to find symbol at $RE_ADDRESS" +REGEX_OUT_SECTION="$BLACKFUNC is out of \.\w+, skip it" +../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding blacklisted function $BLACKFUNC" +(( TEST_RESULT += $? )) + + +### listing not-added probe + +# blacklisted probes should NOT appear in perf-list output +$CMD_PERF list probe:\* > $LOGS_DIR/adding_blacklisted_list.log +PERF_EXIT_CODE=$? + +../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "List of pre-defined events" "Metric Groups:" < $LOGS_DIR/adding_blacklisted_list.log +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing blacklisted probe (should NOT be listed)" +(( TEST_RESULT += $? )) + + +# print overall results +print_overall_results "$TEST_RESULT" +exit $? -- cgit v1.2.3 From adc1dd00dbc192a8811fab482d0e26b2914ba3e3 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:44 +0200 Subject: perf testsuite probe: Add test for basic perf-probe options Test basic behavior of perf-probe subcommand. It is run as a part of perftool-testsuite_probe test case. 
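As a condensed sketch of one of the checks added here (log file names are illustrative; the test probes the vfs_read kernel symbol), the '--quiet' verification boils down to:

  perf probe --quiet --add vfs_read > quiet.log 2> quiet.err
  perf probe --quiet --del vfs_read >> quiet.log 2>> quiet.err
  # --quiet must mute all output, so both logs have to stay empty
  test "$(cat quiet.log quiet.err | wc -l)" -eq 0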
Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-7-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/base_probe/test_basic.sh | 80 +++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100755 tools/perf/tests/shell/base_probe/test_basic.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_basic.sh b/tools/perf/tests/shell/base_probe/test_basic.sh new file mode 100755 index 000000000000..09669ec479f2 --- /dev/null +++ b/tools/perf/tests/shell/base_probe/test_basic.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_basic of perf_probe test +# Author: Michael Petlan +# Author: Masami Hiramatsu +# +# Description: +# +# This test tests basic functionality of perf probe command. +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + +if ! check_kprobes_available; then + print_overall_skipped + exit 0 +fi + + +### help message + +if [ "$PARAM_GENERAL_HELP_TEXT_CHECK" = "y" ]; then + # test that a help message is shown and looks reasonable + $CMD_PERF probe --help > $LOGS_DIR/basic_helpmsg.log 2> $LOGS_DIR/basic_helpmsg.err + PERF_EXIT_CODE=$? + + ../common/check_all_patterns_found.pl "PERF-PROBE" "NAME" "SYNOPSIS" "DESCRIPTION" "OPTIONS" "PROBE\s+SYNTAX" "PROBE\s+ARGUMENT" "LINE\s+SYNTAX" < $LOGS_DIR/basic_helpmsg.log + CHECK_EXIT_CODE=$? + ../common/check_all_patterns_found.pl "LAZY\s+MATCHING" "FILTER\s+PATTERN" "EXAMPLES" "SEE\s+ALSO" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "vmlinux" "module=" "source=" "verbose" "quiet" "add=" "del=" "list.*EVENT" "line=" "vars=" "externs" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "no-inlines" "funcs.*FILTER" "filter=FILTER" "force" "dry-run" "max-probes" "exec=" "demangle-kernel" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_no_patterns_found.pl "No manual entry for" < $LOGS_DIR/basic_helpmsg.err + (( CHECK_EXIT_CODE += $? )) + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "help message" + (( TEST_RESULT += $? )) +else + print_testcase_skipped "help message" +fi + + +### usage message + +# without any args perf-probe should print usage +$CMD_PERF probe 2> $LOGS_DIR/basic_usage.log > /dev/null + +../common/check_all_patterns_found.pl "[Uu]sage" "perf probe" "verbose" "quiet" "add" "del" "force" "line" "vars" "externs" "range" < $LOGS_DIR/basic_usage.log +CHECK_EXIT_CODE=$? + +print_results 0 $CHECK_EXIT_CODE "usage message" +(( TEST_RESULT += $? )) + + +### quiet switch + +# '--quiet' should mute all output +$CMD_PERF probe --quiet --add vfs_read > $LOGS_DIR/basic_quiet01.log 2> $LOGS_DIR/basic_quiet01.err +PERF_EXIT_CODE=$? +$CMD_PERF probe --quiet --del vfs_read > $LOGS_DIR/basic_quiet03.log 2> $LOGS_DIR/basic_quiet02.err +(( PERF_EXIT_CODE += $? )) + +test "`cat $LOGS_DIR/basic_quiet*log $LOGS_DIR/basic_quiet*err | wc -l`" -eq 0 +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "quiet switch" +(( TEST_RESULT += $? )) + + +# print overall results +print_overall_results "$TEST_RESULT" +exit $? 
-- cgit v1.2.3 From 83b6815dbb57aca3cd9b20bdcae769d2a7c38d5f Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:45 +0200 Subject: perf testsuite probe: Add test for invalid options Test if various incompatible options are correctly handled-rejected. It is run as a part of perftool-testsuite_probe test case. Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-8-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/base_probe/test_invalid_options.sh | 79 ++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100755 tools/perf/tests/shell/base_probe/test_invalid_options.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_invalid_options.sh b/tools/perf/tests/shell/base_probe/test_invalid_options.sh new file mode 100755 index 000000000000..1fedfd8b0d0d --- /dev/null +++ b/tools/perf/tests/shell/base_probe/test_invalid_options.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_invalid_options of perf_probe test +# Author: Masami Hiramatsu +# Author: Michael Petlan +# +# Description: +# +# This test checks whether the invalid and incompatible options are reported +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + +if ! check_kprobes_available; then + print_overall_skipped + exit 0 +fi + + +### missing argument + +# some options require an argument +for opt in '-a' '-d' '-L' '-V'; do + ! $CMD_PERF probe $opt 2> $LOGS_DIR/invalid_options_missing_argument$opt.err + PERF_EXIT_CODE=$? + + ../common/check_all_patterns_found.pl "Error: switch .* requires a value" < $LOGS_DIR/invalid_options_missing_argument$opt.err + CHECK_EXIT_CODE=$? + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "missing argument for $opt" + (( TEST_RESULT += $? )) +done + + +### unnecessary argument + +# some options may omit the argument +for opt in '-F' '-l'; do + $CMD_PERF probe -F > /dev/null 2> $LOGS_DIR/invalid_options_unnecessary_argument$opt.err + PERF_EXIT_CODE=$? + + test ! -s $LOGS_DIR/invalid_options_unnecessary_argument$opt.err + CHECK_EXIT_CODE=$? + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "unnecessary argument for $opt" + (( TEST_RESULT += $? )) +done + + +### mutually exclusive options + +# some options are mutually exclusive +test -e $LOGS_DIR/invalid_options_mutually_exclusive.log && rm -f $LOGS_DIR/invalid_options_mutually_exclusive.log +for opt in '-a xxx -d xxx' '-a xxx -L foo' '-a xxx -V foo' '-a xxx -l' '-a xxx -F' \ + '-d xxx -L foo' '-d xxx -V foo' '-d xxx -l' '-d xxx -F' \ + '-L foo -V bar' '-L foo -l' '-L foo -F' '-V foo -l' '-V foo -F' '-l -F'; do + ! $CMD_PERF probe $opt > /dev/null 2> $LOGS_DIR/aux.log + PERF_EXIT_CODE=$? + + ../common/check_all_patterns_found.pl "Error: switch .+ cannot be used with switch .+" < $LOGS_DIR/aux.log + CHECK_EXIT_CODE=$? + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "mutually exclusive options :: $opt" + (( TEST_RESULT += $? )) + + # gather the logs + cat $LOGS_DIR/aux.log | grep "Error" >> $LOGS_DIR/invalid_options_mutually_exclusive.log +done + + +# print overall results +print_overall_results "$TEST_RESULT" +exit $? 
-- cgit v1.2.3 From c0964af8162e7ce9a7b8bb896ece92534d5e7e64 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:46 +0200 Subject: perf testsuite probe: Add test for line semantics The perf-probe command uses specific semantics to describe probes. Test whether patterns that are known to be valid or invalid are handled appropriately. This test is run as a part of the perftool-testsuite_probe test case. Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-9-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/base_probe/test_line_semantics.sh | 55 ++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100755 tools/perf/tests/shell/base_probe/test_line_semantics.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_line_semantics.sh b/tools/perf/tests/shell/base_probe/test_line_semantics.sh new file mode 100755 index 000000000000..d8f4bde0f585 --- /dev/null +++ b/tools/perf/tests/shell/base_probe/test_line_semantics.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_line_semantics of perf_probe test +# Author: Masami Hiramatsu +# Author: Michael Petlan +# +# Description: +# +# This test checks whether the semantic errors of line option's +# arguments are properly reported. +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + +if ! check_kprobes_available; then + print_overall_skipped + exit 0 +fi + + +### acceptable --line descriptions + +# testing acceptance of valid patterns for the '--line' option +VALID_PATTERNS="func func:10 func:0-10 func:2+10 func@source.c func@source.c:1 source.c:1 source.c:1+1 source.c:1-10" +for desc in $VALID_PATTERNS; do + ! ( $CMD_PERF probe --line $desc 2>&1 | grep -q "Semantic error" ) + CHECK_EXIT_CODE=$? + + print_results 0 $CHECK_EXIT_CODE "acceptable descriptions :: $desc" + (( TEST_RESULT += $? )) +done + + +### unacceptable --line descriptions + +# testing handling of invalid patterns for the '--line' option +INVALID_PATTERNS="func:foo func:1-foo func:1+foo func;lazy\*pattern" +for desc in $INVALID_PATTERNS; do + $CMD_PERF probe --line $desc 2>&1 | grep -q "Semantic error" + CHECK_EXIT_CODE=$? + + print_results 0 $CHECK_EXIT_CODE "unacceptable descriptions :: $desc" + (( TEST_RESULT += $? )) +done + + +# print overall results
print_overall_results "$TEST_RESULT" +exit $? -- cgit v1.2.3 From 13d58a6672d11200bd3c35604caa9e166468d61b Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:47 +0200 Subject: perf testsuite: Add common output checking helper As a form of validation, it is a common practice to check whether the outputs of commands contain expected patterns or match a certain regular expression. This output checking helper is designed to allow checking the stderr output of perf commands for unexpected messages, while ignoring messages that are known to be harmless, e.g.: "Lowering default frequency rate to \d+\." "\d+ out of order events recorded." etc.
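A minimal usage sketch (the file names here are illustrative): the helper takes the whitelist file as its first argument, reads the captured stderr on stdin, and exits non-zero if any line matches none of the whitelisted patterns:

  perf record -a -o perf.data -- sleep 1 2> record.err
  ./check_errors_whitelisted.pl stderr-whitelist.txt < record.err \
    || echo "unexpected messages in stderr"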
Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-10-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/common/check_errors_whitelisted.pl | 51 ++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100755 tools/perf/tests/shell/common/check_errors_whitelisted.pl (limited to 'tools') diff --git a/tools/perf/tests/shell/common/check_errors_whitelisted.pl b/tools/perf/tests/shell/common/check_errors_whitelisted.pl new file mode 100755 index 000000000000..c57d355dd76e --- /dev/null +++ b/tools/perf/tests/shell/common/check_errors_whitelisted.pl @@ -0,0 +1,51 @@ +#!/usr/bin/perl +# SPDX-License-Identifier: GPL-2.0 + +$whitelist_file = shift; + +if (defined $whitelist_file) +{ + open (INFILE, $whitelist_file) or die "Checker error: Unable to open the whitelist file: $whitelist_file\n"; + @regexps = ; + close INFILE or die "Checker error: Unable to close the whitelist file: $whitelist_file\n"; +} +else +{ + @regexps = (); +} + +$max_printed_lines = 20; +$max_printed_lines = $ENV{TESTLOG_ERR_MSG_MAX_LINES} if (defined $ENV{TESTLOG_ERR_MSG_MAX_LINES}); + +$quiet = 1; +$quiet = 0 if (defined $ENV{TESTLOG_VERBOSITY} && $ENV{TESTLOG_VERBOSITY} ge 2); + +$passed = 1; +$lines_printed = 0; + +while () +{ + s/\n//; + + $line_matched = 0; + for $r (@regexps) + { + chomp $r; + if (/$r/) + { + $line_matched = 1; + last; + } + } + + unless ($line_matched) + { + if ($lines_printed++ < $max_printed_lines) + { + print "Line did not match any pattern: \"$_\"\n" unless $quiet; + } + $passed = 0; + } +} + +exit ($passed == 0); -- cgit v1.2.3 From 61f87151839b9767a06be93b99f1fc324147a0d3 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:48 +0200 Subject: perf testsuite report: Add test for perf-report basic functionality Test basic execution and some options of perf-report subcommand, like show-nr-samples, header, showcpuutilization, pid and symbol filtering. Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-11-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/base_report/setup.sh | 32 ++++ .../tests/shell/base_report/stderr-whitelist.txt | 5 + tools/perf/tests/shell/base_report/test_basic.sh | 190 +++++++++++++++++++++ tools/perf/tests/shell/common/settings.sh | 2 + 4 files changed, 229 insertions(+) create mode 100755 tools/perf/tests/shell/base_report/setup.sh create mode 100644 tools/perf/tests/shell/base_report/stderr-whitelist.txt create mode 100755 tools/perf/tests/shell/base_report/test_basic.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_report/setup.sh b/tools/perf/tests/shell/base_report/setup.sh new file mode 100755 index 000000000000..4caa496660c6 --- /dev/null +++ b/tools/perf/tests/shell/base_report/setup.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# setup.sh of perf report test +# Author: Michael Petlan +# +# Description: +# +# We need some sample data for perf-report testing +# +# + +# include working environment +. ../common/init.sh + +test -d "$HEADER_TAR_DIR" || mkdir -p "$HEADER_TAR_DIR" + +SW_EVENT="cpu-clock" + +$CMD_PERF record -asdg -e $SW_EVENT -o $CURRENT_TEST_DIR/perf.data -- $CMD_LONGER_SLEEP 2> $LOGS_DIR/setup.log +PERF_EXIT_CODE=$? 
+ +../common/check_all_patterns_found.pl "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" < $LOGS_DIR/setup.log +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "prepare the perf.data file" +TEST_RESULT=$? + +print_overall_results $TEST_RESULT +exit $? diff --git a/tools/perf/tests/shell/base_report/stderr-whitelist.txt b/tools/perf/tests/shell/base_report/stderr-whitelist.txt new file mode 100644 index 000000000000..e3341401b47c --- /dev/null +++ b/tools/perf/tests/shell/base_report/stderr-whitelist.txt @@ -0,0 +1,5 @@ +no symbols found in .*, maybe install a debug package +was updated .*is prelink enabled.+ Restart the long running apps that use it +Warning: +\d+ out of order events recorded. +detected invalid bpf_prog_info diff --git a/tools/perf/tests/shell/base_report/test_basic.sh b/tools/perf/tests/shell/base_report/test_basic.sh new file mode 100755 index 000000000000..47677cbd4df3 --- /dev/null +++ b/tools/perf/tests/shell/base_report/test_basic.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_basic of perf_report test +# Author: Michael Petlan +# +# Description: +# +# This test tests basic functionality of perf report command. +# +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + + +### help message + +if [ "$PARAM_GENERAL_HELP_TEXT_CHECK" = "y" ]; then + # test that a help message is shown and looks reasonable + $CMD_PERF report --help > $LOGS_DIR/basic_helpmsg.log 2> $LOGS_DIR/basic_helpmsg.err + PERF_EXIT_CODE=$? + + ../common/check_all_patterns_found.pl "PERF-REPORT" "NAME" "SYNOPSIS" "DESCRIPTION" "OPTIONS" "OVERHEAD\s+CALCULATION" "SEE ALSO" < $LOGS_DIR/basic_helpmsg.log + CHECK_EXIT_CODE=$? + ../common/check_all_patterns_found.pl "input" "verbose" "show-nr-samples" "show-cpu-utilization" "threads" "comms" "pid" "tid" "dsos" "symbols" "symbol-filter" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "hide-unresolved" "sort" "fields" "parent" "exclude-other" "column-widths" "field-separator" "dump-raw-trace" "children" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "call-graph" "max-stack" "inverted" "ignore-callees" "pretty" "stdio" "tui" "gtk" "vmlinux" "kallsyms" "modules" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "force" "symfs" "cpu" "disassembler-style" "source" "asm-raw" "show-total-period" "show-info" "branch-stack" "group" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "branch-history" "objdump" "demangle" "percent-limit" "percentage" "header" "itrace" "full-source-path" "show-ref-call-graph" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_no_patterns_found.pl "No manual entry for" < $LOGS_DIR/basic_helpmsg.err + (( CHECK_EXIT_CODE += $? )) + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "help message" + (( TEST_RESULT += $? )) +else + print_testcase_skipped "help message" +fi + + +### basic execution + +# test that perf report is even working +$CMD_PERF report -i $CURRENT_TEST_DIR/perf.data --stdio > $LOGS_DIR/basic_basic.log 2> $LOGS_DIR/basic_basic.err +PERF_EXIT_CODE=$? 
+ +REGEX_LOST_SAMPLES_INFO="#\s*Total Lost Samples:\s+$RE_NUMBER" +REGEX_SAMPLES_INFO="#\s*Samples:\s+(?:$RE_NUMBER)\w?\s+of\s+event\s+'$RE_EVENT_ANY'" +REGEX_LINES_HEADER="#\s*Children\s+Self\s+Command\s+Shared Object\s+Symbol" +REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+\S+\s+\[kernel\.(?:vmlinux|kallsyms)\]\s+\[[k\.]\]\s+\w+" +../common/check_all_patterns_found.pl "$REGEX_LOST_SAMPLES_INFO" "$REGEX_SAMPLES_INFO" "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_basic.log +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_basic.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "basic execution" +(( TEST_RESULT += $? )) + + +### number of samples + +# '--show-nr-samples' should show number of samples for each symbol +$CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --show-nr-samples > $LOGS_DIR/basic_nrsamples.log 2> $LOGS_DIR/basic_nrsamples.err +PERF_EXIT_CODE=$? + +REGEX_LINES_HEADER="#\s*Children\s+Self\s+Samples\s+Command\s+Shared Object\s+Symbol" +REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER\s+\S+\s+\[kernel\.(?:vmlinux|kallsyms)\]\s+\[[k\.]\]\s+\w+" +../common/check_all_patterns_found.pl "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_nrsamples.log +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_nrsamples.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "number of samples" +(( TEST_RESULT += $? )) + + +### header + +# '--header' and '--header-only' should show perf report header +$CMD_PERF report -i $CURRENT_TEST_DIR/perf.data --stdio --header-only > $LOGS_DIR/basic_header.log +PERF_EXIT_CODE=$? + +REGEX_LINE_TIMESTAMP="#\s+captured on\s*:\s*$RE_DATE_TIME" +REGEX_LINE_HOSTNAME="#\s+hostname\s*:\s*$MY_HOSTNAME" +REGEX_LINE_KERNEL="#\s+os release\s*:\s*${MY_KERNEL_VERSION//+/\\+}" +REGEX_LINE_PERF="#\s+perf version\s*:\s*" +REGEX_LINE_ARCH="#\s+arch\s*:\s*$MY_ARCH" +REGEX_LINE_CPUS_ONLINE="#\s+nrcpus online\s*:\s*$MY_CPUS_ONLINE" +REGEX_LINE_CPUS_AVAIL="#\s+nrcpus avail\s*:\s*$MY_CPUS_AVAILABLE" +# disable precise check for "nrcpus avail" in BASIC runmode +test $PERFTOOL_TESTSUITE_RUNMODE -lt $RUNMODE_STANDARD && REGEX_LINE_CPUS_AVAIL="#\s+nrcpus avail\s*:\s*$RE_NUMBER" +../common/check_all_patterns_found.pl "$REGEX_LINE_TIMESTAMP" "$REGEX_LINE_HOSTNAME" "$REGEX_LINE_KERNEL" "$REGEX_LINE_PERF" "$REGEX_LINE_ARCH" "$REGEX_LINE_CPUS_ONLINE" "$REGEX_LINE_CPUS_AVAIL" < $LOGS_DIR/basic_header.log +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "header" +(( TEST_RESULT += $? )) + +# '--header' and '--header-only' should use creation time +OLD_TIMESTAMP=`$CMD_PERF report --stdio --header-only -i $CURRENT_TEST_DIR/perf.data | grep "captured on"` +PERF_EXIT_CODE=$? + +( tar -C $CURRENT_TEST_DIR -c perf.data | xz > $CURRENT_TEST_DIR/perf.data.tar.xz ; xzcat $CURRENT_TEST_DIR/perf.data.tar.xz | tar x -C $HEADER_TAR_DIR ) +(( PERF_EXIT_CODE += $? )) + +NEW_TIMESTAMP=`$CMD_PERF report --stdio --header-only -i $HEADER_TAR_DIR/perf.data | grep "captured on"` +(( PERF_EXIT_CODE += $? )) + +test "$OLD_TIMESTAMP" = "$NEW_TIMESTAMP" +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "header timestamp" +(( TEST_RESULT += $? )) + + +### show CPU utilization + +# '--showcpuutilization' should show percentage for both system and userspace mode +$CMD_PERF report -i $CURRENT_TEST_DIR/perf.data --stdio --showcpuutilization > $LOGS_DIR/basic_cpuut.log 2> $LOGS_DIR/basic_cpuut.err +PERF_EXIT_CODE=$? + +REGEX_LINES_HEADER="#\s*Children\s+Self\s+sys\s+usr\s+Command\s+Shared Object\s+Symbol" +REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER%\s+\S+\s+\[kernel\.(?:vmlinux|kallsyms)\]\s+\[[k\.]\]\s+\w+" +../common/check_all_patterns_found.pl "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_cpuut.log +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_cpuut.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "show CPU utilization" +(( TEST_RESULT += $? )) + + +### pid + +# '--pid=' should limit the output for a process with the given pid only +$CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --pid=1 > $LOGS_DIR/basic_pid.log 2> $LOGS_DIR/basic_pid.err +PERF_EXIT_CODE=$? + +grep -P -v '^#' $LOGS_DIR/basic_pid.log | grep -P '\s+[\d\.]+%' | ../common/check_all_lines_matched.pl "systemd|init" +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_pid.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "pid" +(( TEST_RESULT += $? )) + + +### non-existing symbol + +# '--symbols' should show only the given symbols +$CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --symbols=dummynonexistingsymbol > $LOGS_DIR/basic_symbols.log 2> $LOGS_DIR/basic_symbols.err +PERF_EXIT_CODE=$? + +../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "$RE_LINE_COMMENT" < $LOGS_DIR/basic_symbols.log +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_symbols.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "non-existing symbol" +(( TEST_RESULT += $? )) + + +### symbol filter + +# '--symbol-filter' should filter symbols based on substrings +$CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --symbol-filter=map > $LOGS_DIR/basic_symbolfilter.log 2> $LOGS_DIR/basic_symbolfilter.err +PERF_EXIT_CODE=$? + +grep -P -v '^#' $LOGS_DIR/basic_symbolfilter.log | grep -P '\s+[\d\.]+%' | ../common/check_all_lines_matched.pl "\[[k\.]\]\s+.*map" +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_symbolfilter.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "symbol filter" +(( TEST_RESULT += $? )) + + +# TODO: $CMD_PERF report -n --showcpuutilization -TUxDg 2> 01.log + +# print overall results +print_overall_results "$TEST_RESULT" +exit $? diff --git a/tools/perf/tests/shell/common/settings.sh b/tools/perf/tests/shell/common/settings.sh index 2e50dd0bff9b..cba1b338f96f 100644 --- a/tools/perf/tests/shell/common/settings.sh +++ b/tools/perf/tests/shell/common/settings.sh @@ -80,12 +80,14 @@ if [ -n "$PERFSUITE_RUN_DIR" ]; then export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME" export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples" export LOGS_DIR="$CURRENT_TEST_DIR/logs" + export HEADER_TAR_DIR="$CURRENT_TEST_DIR/header_tar" test -d "$CURRENT_TEST_DIR" || mkdir -p "$CURRENT_TEST_DIR" test -d "$LOGS_DIR" || mkdir -p "$LOGS_DIR" else # when $PERFSUITE_RUN_DIR is not set, logs will be placed here export CURRENT_TEST_DIR="." export LOGS_DIR="."
+ export HEADER_TAR_DIR="./header_tar" fi -- cgit v1.2.3 From e37cb2a6beed817df510832931bfdaa6bb8a0447 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:49 +0200 Subject: perf testsuite report: Add test case for perf report Add a new 'perf report' test case that acts as an entry element in 'perf test list'. Runs multiple subtests from directory "base_report", which can be expanded without further editing. Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-12-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/tests/shell/perftool-testsuite_report.sh | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100755 tools/perf/tests/shell/perftool-testsuite_report.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/perftool-testsuite_report.sh b/tools/perf/tests/shell/perftool-testsuite_report.sh new file mode 100755 index 000000000000..973012ce92a7 --- /dev/null +++ b/tools/perf/tests/shell/perftool-testsuite_report.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# perftool-testsuite_report +# SPDX-License-Identifier: GPL-2.0 + +test -d "$(dirname "$0")/base_report" || exit 2 +cd "$(dirname "$0")/base_report" || exit 2 +status=0 + +PERFSUITE_RUN_DIR=$(mktemp -d /tmp/"$(basename "$0" .sh)".XXX) +export PERFSUITE_RUN_DIR + +for testcase in setup.sh test_*; do # skip setup.sh if not present or not executable + test -x "$testcase" || continue + ./"$testcase" + (( status += $? )) +done + +if ! [ "$PERFTEST_KEEP_LOGS" = "y" ]; then + rm -rf "$PERFSUITE_RUN_DIR" +fi + +test $status -ne 0 && exit 1 +exit 0 -- cgit v1.2.3 From 097fe67df1aa9cc79092b15c68c9135569a679d6 Mon Sep 17 00:00:00 2001 From: Michael Petlan Date: Tue, 2 Jul 2024 13:08:50 +0200 Subject: perf testsuite: Install perf-report tests in the 'make install-tests -C tools/perf' target Signed-off-by: Michael Petlan Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-13-vmolnaro@redhat.com Signed-off-by: Veronika Molnarova Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 9731f5e84131..9dd2e8d3f3c9 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1142,6 +1142,8 @@ install-tests: all install-gtk $(INSTALL) tests/shell/common/*.pl '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/common'; \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_probe'; \ $(INSTALL) tests/shell/base_probe/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_probe'; \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_report'; \ + $(INSTALL) tests/shell/base_report/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_report'; \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' ; \ $(INSTALL) tests/shell/coresight/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' $(Q)$(MAKE) -C tests/shell/coresight install-tests -- cgit v1.2.3 From 8b48f8ba16b06024920c8ff349fed769118777cc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Aug 2024 14:27:57 -0700 Subject: perf report: Name events in stats for pipe mode In stats mode PERF_RECORD_EVENT_UPDATE isn't being handled, meaning the evsels aren't named when handling pipe mode output.
Before: $ perf record -e inst_retired.any -a -o - sleep 0.1|perf report --stats -i - ... Aggregated stats: TOTAL events: 23358 COMM events: 2608 (11.2%) EXIT events: 1 ( 0.0%) FORK events: 2607 (11.2%) SAMPLE events: 174 ( 0.7%) MMAP2 events: 17936 (76.8%) ATTR events: 2 ( 0.0%) FINISHED_ROUND events: 2 ( 0.0%) ID_INDEX events: 1 ( 0.0%) THREAD_MAP events: 1 ( 0.0%) CPU_MAP events: 1 ( 0.0%) EVENT_UPDATE events: 3 ( 0.0%) TIME_CONV events: 1 ( 0.0%) FEATURE events: 20 ( 0.1%) FINISHED_INIT events: 1 ( 0.0%) raw 0xc0 stats: SAMPLE events: 174 After: $ perf record -e inst_retired.any -a -o - sleep 0.1|perf report --stats -i - ... Aggregated stats: TOTAL events: 23742 COMM events: 2620 (11.0%) EXIT events: 2 ( 0.0%) FORK events: 2619 (11.0%) SAMPLE events: 165 ( 0.7%) MMAP2 events: 18304 (77.1%) ATTR events: 2 ( 0.0%) FINISHED_ROUND events: 2 ( 0.0%) ID_INDEX events: 1 ( 0.0%) THREAD_MAP events: 1 ( 0.0%) CPU_MAP events: 1 ( 0.0%) EVENT_UPDATE events: 3 ( 0.0%) TIME_CONV events: 1 ( 0.0%) FEATURE events: 20 ( 0.1%) FINISHED_INIT events: 1 ( 0.0%) inst_retired.any stats: SAMPLE events: 165 This makes the pipe output match the regular output. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240827212757.1469340-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index c304d9b06190..5dc17ffee27a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -816,6 +816,7 @@ static void stats_setup(struct report *rep) rep->tool.attr = process_attr; rep->tool.sample = count_sample_event; rep->tool.lost_samples = count_lost_samples_event; + rep->tool.event_update = perf_event__process_event_update; rep->tool.no_warn = true; } -- cgit v1.2.3 From 4451dae46992131ef5c42147444c183c364b1149 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 15 Aug 2024 09:36:17 +0800 Subject: perf evlist: Introduce method to find if there is a bpf-output event We'll use it in the next patch, to decide how to set up the ring buffer.
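A sketch of the intended call site, mirroring the hunk that the next patch in this series actually adds to evlist__create_maps():

	/* When the evlist carries a bpf-output event, open it on the real
	 * CPUs instead of the "any CPU" dummy map, so that BPF's
	 * perf_event_output() can feed its ring buffer. */
	if (target__uses_dummy_map(target) && !evlist__has_bpf_output(evlist))
		cpus = perf_cpu_map__new_any_cpu();
	else
		cpus = perf_cpu_map__new(target->cpu_list);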
Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240815013626.935097-2-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 12 ++++++++++++ tools/perf/util/evlist.h | 1 + 2 files changed, 13 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index c0379fa1ccfe..ef58a7764318 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -2627,3 +2627,15 @@ void evlist__uniquify_name(struct evlist *evlist) } } } + +bool evlist__has_bpf_output(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (evsel__is_bpf_output(evsel)) + return true; + } + + return false; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index b46f1a320783..bcc1c6984bb5 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -447,5 +447,6 @@ int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf); void evlist__check_mem_load_aux(struct evlist *evlist); void evlist__warn_user_requested_cpus(struct evlist *evlist, const char *cpu_list); void evlist__uniquify_name(struct evlist *evlist); +bool evlist__has_bpf_output(struct evlist *evlist); #endif /* __PERF_EVLIST_H */ -- cgit v1.2.3 From 8df1d8c6cbd6825b3784068e7c2b37fa8a8a43f0 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 15 Aug 2024 09:36:17 +0800 Subject: perf trace: Fix perf trace -p <pid> 'perf trace -p <pid>' works on a syscall that is unaugmented, but doesn't work on a syscall that's augmented (when it calls perf_event_output() in BPF). Let's take open() as an example. open() is augmented in perf trace. Before: $ perf trace -e open -p 3792392 ? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory) ? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory) We can see there's no output. After: $ perf trace -e open -p 3792392 0.000 ( 0.123 ms): a.out/3792392 open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory) 1000.398 ( 0.116 ms): a.out/3792392 open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory) Reason: bpf_perf_event_output() will fail when you specify a pid in 'perf trace' (EOPNOTSUPP). When using 'perf trace -p 114', before perf_event_open(), we'll have PID = 114, and CPU = -1. This is bad for the bpf-output event, because the ring buffer won't accept output from BPF's perf_event_output(), making it fail. I'm still trying to find out why. If we open bpf-output for every cpu, instead of setting it to -1, like this: PID = <pid>, CPU = 0 PID = <pid>, CPU = 1 PID = <pid>, CPU = 2 PID = <pid>, CPU = 3 Everything works.
You can test it with this script (open.c): #include <unistd.h> #include <sys/syscall.h> int main() { int i1 = 1, i2 = 2, i3 = 3, i4 = 4; char s1[] = "DINGZHEN", s2[] = "XUEBAO"; while (1) { syscall(SYS_open, s1, i1, i2); sleep(1); } return 0; } save, compile: make open perf trace: perf trace -e open Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240815013626.935097-2-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ef58a7764318..f14b7e6ff1dc 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1067,7 +1067,7 @@ int evlist__create_maps(struct evlist *evlist, struct target *target) if (!threads) return -1; - if (target__uses_dummy_map(target)) + if (target__uses_dummy_map(target) && !evlist__has_bpf_output(evlist)) cpus = perf_cpu_map__new_any_cpu(); else cpus = perf_cpu_map__new(target->cpu_list); -- cgit v1.2.3 From 7bedcbaefdf5d4f71302cb5428eb72ada4f380e9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 22 Aug 2024 15:10:27 -0300 Subject: perf trace: Pass the richer 'struct syscall_arg' pointer to trace__btf_scnprintf() We'll need it later in the current patch series, and we can get the syscall_arg_fmt from syscall_arg->fmt. Based-on-a-patch-by: Howard Chu Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Zsd8vqCrTh5h69rp@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d6ca541fdc78..53690bbb143e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -989,9 +989,11 @@ static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, c return 0; } -static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, +static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf, size_t size, int val, char *type) { + struct syscall_arg_fmt *arg_fmt = arg->fmt; + if (trace->btf == NULL) return 0; @@ -1011,7 +1013,7 @@ static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg_fmt * } #else // HAVE_LIBBPF_SUPPORT -static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg_fmt *arg_fmt __maybe_unused, +static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused, char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, char *type __maybe_unused) { @@ -2260,7 +2262,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); - btf_printed = trace__btf_scnprintf(trace, &sc->arg_fmt[arg.idx], bf + printed, + btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed, size - printed, val, field->type); if (btf_printed) { printed += btf_printed; @@ -2963,7 +2965,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); - btf_printed = trace__btf_scnprintf(trace, arg, bf +
printed, size - printed, val, field->type); + btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type); if (btf_printed) { printed += btf_printed; continue; -- cgit v1.2.3 From c5d50457a8fc2695ca5a921f3a4a402343cf5313 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 27 Aug 2024 11:02:04 +0530 Subject: perf vendor events power10: Update JSON/events Update JSON/events for power10 platform with additional events. Reviewed-by: Ian Rogers Signed-off-by: Kajol Jain Cc: Athira Rajeev Cc: Disha Goel Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20240827053206.538814-1-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/powerpc/power10/datasource.json | 25 ++++++++++++++++++++++ .../pmu-events/arch/powerpc/power10/frontend.json | 10 +++++++++ .../perf/pmu-events/arch/powerpc/power10/pmc.json | 5 +++++ 3 files changed, 40 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json index 0eeaaf1a95b8..283284745d9c 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json @@ -14,6 +14,31 @@ "EventName": "PM_DATA_FROM_MEMORY", "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss." }, + { + "EventCode": "0x0000004080", + "EventName": "PM_INST_FROM_L1", + "BriefDescription": "An instruction fetch hit in the L1. Each fetch group contains 8 instructions. The same line can hit 4 times if 32 sequential instructions are fetched." + }, + { + "EventCode": "0x000000026080", + "EventName": "PM_L2_LD_MISS", + "BriefDescription": "All successful D-Side Load dispatches for this thread that missed in the L2. Since the event happens in a 2:1 clock domain and is time-sliced across all 4 threads, the event count should be multiplied by 2." + }, + { + "EventCode": "0x000000026880", + "EventName": "PM_L2_ST_MISS", + "BriefDescription": "All successful D-Side Store dispatches for this thread that missed in the L2. Since the event happens in a 2:1 clock domain and is time-sliced across all 4 threads, the event count should be multiplied by 2." + }, + { + "EventCode": "0x010000046880", + "EventName": "PM_L2_ST_HIT", + "BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits. Since the event happens in a 2:1 clock domain and is time-sliced across all 4 threads, the event count should be multiplied by 2." + }, + { + "EventCode": "0x000000036880", + "EventName": "PM_L2_INST_MISS", + "BriefDescription": "All successful instruction (demand and prefetch) dispatches for this thread that missed in the L2. Since the event happens in a 2:1 clock domain and is time-sliced across all 4 threads, the event count should be multiplied by 2." + }, { "EventCode": "0x000300000000C040", "EventName": "PM_INST_FROM_L2", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index 53660c279286..456971f60814 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -93,5 +93,15 @@ "EventCode": "0x400FC", "EventName": "PM_ITLB_MISS", "BriefDescription": "Instruction TLB reload (after a miss), all page sizes. Includes only demand misses." 
+ }, + { + "EventCode": "0x00000040B8", + "EventName": "PM_PRED_BR_TKN_COND_DIR", + "BriefDescription": "A conditional branch finished with correctly predicted direction. Resolved taken." + }, + { + "EventCode": "0x00000048B8", + "EventName": "PM_PRED_BR_NTKN_COND_DIR", + "BriefDescription": "A conditional branch finished with correctly predicted direction. Resolved not taken." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json index 0e0253d0e757..04732698d9b2 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json @@ -104,6 +104,11 @@ "EventName": "PM_RUN_CYC", "BriefDescription": "Processor cycles gated by the run latch." }, + { + "EventCode": "0x200F8", + "EventName": "PM_EXT_INT", + "BriefDescription": "Cycles an external interrupt was active." + }, { "EventCode": "0x30010", "EventName": "PM_PMC2_OVERFLOW", -- cgit v1.2.3 From 0edee819712ed00ce199f94108a8d588970eea80 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 27 Aug 2024 11:02:05 +0530 Subject: perf vendor events power10: Move the JSON/events Move some of the JSON/events from others.json to more appropriate JSON files for power10 platform. Reviewed-by: Ian Rogers Signed-off-by: Kajol Jain Cc: Athira Rajeev Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20240827053206.538814-2-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/powerpc/power10/cache.json | 15 +++ .../arch/powerpc/power10/datasource.json | 15 +++ .../pmu-events/arch/powerpc/power10/frontend.json | 10 ++ .../pmu-events/arch/powerpc/power10/locks.json | 10 ++ .../pmu-events/arch/powerpc/power10/memory.json | 30 +++++ .../pmu-events/arch/powerpc/power10/others.json | 130 --------------------- .../pmu-events/arch/powerpc/power10/pipeline.json | 45 +++++++ .../perf/pmu-events/arch/powerpc/power10/pmc.json | 5 + 8 files changed, 130 insertions(+), 130 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json index 839ae26945fb..9814a59fce31 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/cache.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/cache.json @@ -1,4 +1,19 @@ [ + { + "EventCode": "0x1002C", + "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS", + "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request." + }, + { + "EventCode": "0x200FD", + "EventName": "PM_L1_ICACHE_MISS", + "BriefDescription": "Demand instruction cache miss." + }, + { + "EventCode": "0x30068", + "EventName": "PM_L1_ICACHE_RELOADED_PREF", + "BriefDescription": "Counts all instruction cache prefetch reloads (includes demand turned into prefetch)." + }, { "EventCode": "0x300F4", "EventName": "PM_RUN_INST_CMPL_CONC", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json index 283284745d9c..a5d5be35b5e6 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json @@ -1,4 +1,14 @@ [ + { + "EventCode": "0x1505E", + "EventName": "PM_LD_HIT_L1", + "BriefDescription": "Load finished without experiencing an L1 miss." 
+ }, + { + "EventCode": "0x100FC", + "EventName": "PM_LD_REF_L1", + "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included." + }, { "EventCode": "0x200FE", "EventName": "PM_DATA_FROM_L2MISS", @@ -9,6 +19,11 @@ "EventName": "PM_DATA_FROM_L3MISS", "BriefDescription": "The processor's L1 data cache was reloaded from beyond the local core's L3 due to a demand miss." }, + { + "EventCode": "0x400F0", + "EventName": "PM_LD_DEMAND_MISS_L1_FIN", + "BriefDescription": "Load missed L1, counted at finish time." + }, { "EventCode": "0x400FE", "EventName": "PM_DATA_FROM_MEMORY", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index 456971f60814..684374fe5699 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -84,6 +84,11 @@ "EventName": "PM_VECTOR_ST_CMPL", "BriefDescription": "Vector store instruction completed." }, + { + "EventCode": "0x4D05E", + "EventName": "PM_BR_CMPL", + "BriefDescription": "A branch completed. All branches are included." + }, { "EventCode": "0x4E054", "EventName": "PM_DTLB_HIT_1G", @@ -94,6 +99,11 @@ "EventName": "PM_ITLB_MISS", "BriefDescription": "Instruction TLB reload (after a miss), all page sizes. Includes only demand misses." }, + { + "EventCode": "0x00000048B4", + "EventName": "PM_BR_TKN_UNCOND_FIN", + "BriefDescription": "An unconditional branch finished. All unconditional branches are taken." + }, { "EventCode": "0x00000040B8", "EventName": "PM_PRED_BR_TKN_COND_DIR", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/locks.json b/tools/perf/pmu-events/arch/powerpc/power10/locks.json index b5a0d6521963..a8ea4d0def1a 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/locks.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/locks.json @@ -4,9 +4,19 @@ "EventName": "PM_STCX_FAIL_FIN", "BriefDescription": "Conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock." }, + { + "EventCode": "0x2E014", + "EventName": "PM_STCX_FIN", + "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock." + }, { "EventCode": "0x4E050", "EventName": "PM_STCX_PASS_FIN", "BriefDescription": "Conditional store instruction (STCX) passed. LARX and STCX are instructions used to acquire a lock." + }, + { + "EventCode": "0x000000C8B8", + "EventName": "PM_STCX_SUCCESS_CMPL", + "BriefDescription": "STCX instructions that completed successfully. Specifically, counts only when a pass status is returned from the nest." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/memory.json b/tools/perf/pmu-events/arch/powerpc/power10/memory.json index 885262957beb..0d7191b3f2c6 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/memory.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/memory.json @@ -69,6 +69,11 @@ "EventName": "PM_XFER_FROM_SRC_PMC3", "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." 
}, + { + "EventCode": "0x3F04A", + "EventName": "PM_LSU_ST5_FIN", + "BriefDescription": "LSU Finished an internal operation in ST2 port." + }, { "EventCode": "0x3C054", "EventName": "PM_DERAT_MISS_16M", @@ -108,5 +113,30 @@ "EventCode": "0x4C05A", "EventName": "PM_DTLB_MISS_1G", "BriefDescription": "Data TLB reload (after a miss) page size 1G. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "0x000000F880", + "EventName": "PM_SNOOP_TLBIE_CYC", + "BriefDescription": "Cycles in which TLBIE snoops are executed in the LSU." + }, + { + "EventCode": "0x000000F084", + "EventName": "PM_SNOOP_TLBIE_CACHE_WALK_CYC", + "BriefDescription": "TLBIE snoop cycles in which the data cache is being walked." + }, + { + "EventCode": "0x000000F884", + "EventName": "PM_SNOOP_TLBIE_WAIT_ST_CYC", + "BriefDescription": "TLBIE snoop cycles in which older stores are still draining." + }, + { + "EventCode": "0x000000F088", + "EventName": "PM_SNOOP_TLBIE_WAIT_LD_CYC", + "BriefDescription": "TLBIE snoop cycles in which older loads are still draining." + }, + { + "EventCode": "0x000000F08C", + "EventName": "PM_SNOOP_TLBIE_WAIT_MMU_CYC", + "BriefDescription": "TLBIE snoop cycles in which the Load-Store unit is waiting for the MMU to finish invalidation." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json index 3789304cb363..1bf802076ee0 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/others.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/others.json @@ -1,89 +1,19 @@ [ - { - "EventCode": "0x1002C", - "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS", - "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request." - }, - { - "EventCode": "0x1505E", - "EventName": "PM_LD_HIT_L1", - "BriefDescription": "Load finished without experiencing an L1 miss." - }, - { - "EventCode": "0x1F056", - "EventName": "PM_DISP_SS0_2_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions." - }, - { - "EventCode": "0x1F05A", - "EventName": "PM_DISP_HELD_SYNC_CYC", - "BriefDescription": "Cycles dispatch is held because of a synchronizing instruction that requires the ICT to be empty before dispatch." - }, { "EventCode": "0x10066", "EventName": "PM_ADJUNCT_CYC", "BriefDescription": "Cycles in which the thread is in Adjunct state. MSR[S HV PR] bits = 011." }, - { - "EventCode": "0x100FC", - "EventName": "PM_LD_REF_L1", - "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included." - }, { "EventCode": "0x2E010", "EventName": "PM_ADJUNCT_INST_CMPL", "BriefDescription": "PowerPC instruction completed while the thread was in Adjunct state." }, - { - "EventCode": "0x2E014", - "EventName": "PM_STCX_FIN", - "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock." - }, - { - "EventCode": "0x2F054", - "EventName": "PM_DISP_SS1_2_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions." - }, - { - "EventCode": "0x2F056", - "EventName": "PM_DISP_SS1_4_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions." 
- }, { "EventCode": "0x200F2", "EventName": "PM_INST_DISP", "BriefDescription": "PowerPC instruction dispatched." }, - { - "EventCode": "0x200FD", - "EventName": "PM_L1_ICACHE_MISS", - "BriefDescription": "Demand instruction cache miss." - }, - { - "EventCode": "0x3F04A", - "EventName": "PM_LSU_ST5_FIN", - "BriefDescription": "LSU Finished an internal operation in ST2 port." - }, - { - "EventCode": "0x3405A", - "EventName": "PM_PRIVILEGED_INST_CMPL", - "BriefDescription": "PowerPC instruction completed while the thread was in Privileged state." - }, - { - "EventCode": "0x3F054", - "EventName": "PM_DISP_SS0_4_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions." - }, - { - "EventCode": "0x3F056", - "EventName": "PM_DISP_SS0_8_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions." - }, - { - "EventCode": "0x30068", - "EventName": "PM_L1_ICACHE_RELOADED_PREF", - "BriefDescription": "Counts all instruction cache prefetch reloads (includes demand turned into prefetch)." - }, { "EventCode": "0x300F6", "EventName": "PM_LD_DEMAND_MISS_L1", @@ -94,16 +24,6 @@ "EventName": "PM_L1_ICACHE_RELOADED_ALL", "BriefDescription": "Counts all instruction cache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch." }, - { - "EventCode": "0x4D05E", - "EventName": "PM_BR_CMPL", - "BriefDescription": "A branch completed. All branches are included." - }, - { - "EventCode": "0x400F0", - "EventName": "PM_LD_DEMAND_MISS_L1_FIN", - "BriefDescription": "Load missed L1, counted at finish time." - }, { "EventCode": "0x00000038BC", "EventName": "PM_ISYNC_CMPL", @@ -139,64 +59,14 @@ "EventName": "PM_ST1_UNALIGNED_FIN", "BriefDescription": "Store instructions in ST1 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline. This typically adds about 10 cycles to the latency of the instruction. This only includes stores that cross the 128 byte boundary. Counted at finish time." }, - { - "EventCode": "0x000000C8B8", - "EventName": "PM_STCX_SUCCESS_CMPL", - "BriefDescription": "STCX instructions that completed successfully. Specifically, counts only when a pass status is returned from the nest." - }, { "EventCode": "0x000000D0B4", "EventName": "PM_DC_PREF_STRIDED_CONF", "BriefDescription": "A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software." }, - { - "EventCode": "0x000000F880", - "EventName": "PM_SNOOP_TLBIE_CYC", - "BriefDescription": "Cycles in which TLBIE snoops are executed in the LSU." - }, - { - "EventCode": "0x000000F084", - "EventName": "PM_SNOOP_TLBIE_CACHE_WALK_CYC", - "BriefDescription": "TLBIE snoop cycles in which the data cache is being walked." - }, - { - "EventCode": "0x000000F884", - "EventName": "PM_SNOOP_TLBIE_WAIT_ST_CYC", - "BriefDescription": "TLBIE snoop cycles in which older stores are still draining." - }, - { - "EventCode": "0x000000F088", - "EventName": "PM_SNOOP_TLBIE_WAIT_LD_CYC", - "BriefDescription": "TLBIE snoop cycles in which older loads are still draining." - }, - { - "EventCode": "0x000000F08C", - "EventName": "PM_SNOOP_TLBIE_WAIT_MMU_CYC", - "BriefDescription": "TLBIE snoop cycles in which the Load-Store unit is waiting for the MMU to finish invalidation." 
- }, { "EventCode": "0x0000004884", "EventName": "PM_NO_FETCH_IBUF_FULL_CYC", "BriefDescription": "Cycles in which no instructions are fetched because there is no room in the instruction buffers." - }, - { - "EventCode": "0x00000048B4", - "EventName": "PM_BR_TKN_UNCOND_FIN", - "BriefDescription": "An unconditional branch finished. All unconditional branches are taken." - }, - { - "EventCode": "0x0B0000016080", - "EventName": "PM_L2_TLBIE_SLBIE_START", - "BriefDescription": "NCU Master received a TLBIE/SLBIEG/SLBIAG operation from the core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." - }, - { - "EventCode": "0x0B0000016880", - "EventName": "PM_L2_TLBIE_SLBIE_DELAY", - "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG command was held in a hottemp condition by the NCU Master. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_TLBIE_SLBIE_SENT to obtain the average time a TLBIE/SLBIEG/SLBIAG command was held. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." - }, - { - "EventCode": "0x0B0000026880", - "EventName": "PM_L2_SNP_TLBIE_SLBIE_DELAY", - "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG that targets this thread's LPAR was in flight while in a hottemp condition. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_SNP_TLBIE_SLBIE_START to obtain the overall efficiency. Note: 'inflight' means SnpTLB has been sent to core(ie doesn't include when SnpTLB is in NCU waiting to be launched serially behind different SnpTLB). The NCU Snooper gets in a 'hottemp' delay window when it detects it is above its TLBIE/SLBIE threshold for process SnpTLBIE/SLBIE with this core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json index 21b23bb55d0d..940375d251cb 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json @@ -94,11 +94,21 @@ "EventName": "PM_CMPL_STALL_LWSYNC", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete." }, + { + "EventCode": "0x1F056", + "EventName": "PM_DISP_SS0_2_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions." + }, { "EventCode": "0x1F058", "EventName": "PM_DISP_HELD_CYC", "BriefDescription": "Cycles dispatch is held." }, + { + "EventCode": "0x1F05A", + "EventName": "PM_DISP_HELD_SYNC_CYC", + "BriefDescription": "Cycles dispatch is held because of a synchronizing instruction that requires the ICT to be empty before dispatch." + }, { "EventCode": "0x10064", "EventName": "PM_DISP_STALL_IC_L2", @@ -229,6 +239,16 @@ "EventName": "PM_NTC_FIN", "BriefDescription": "Cycles in which the oldest instruction in the pipeline (NTC) finishes. Note that instructions can finish out of order, therefore not all the instructions that finish have a Next-to-complete status." }, + { + "EventCode": "0x2F054", + "EventName": "PM_DISP_SS1_2_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions." 
+ }, + { + "EventCode": "0x2F056", + "EventName": "PM_DISP_SS1_4_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions." + }, { "EventCode": "0x20066", "EventName": "PM_DISP_HELD_OTHER_CYC", @@ -329,6 +349,16 @@ "EventName": "PM_DISP_STALL_IC_L3", "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3." }, + { + "EventCode": "0x3F054", + "EventName": "PM_DISP_SS0_4_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions." + }, + { + "EventCode": "0x3F056", + "EventName": "PM_DISP_SS0_8_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions." + }, { "EventCode": "0x30060", "EventName": "PM_DISP_HELD_XVFC_MAPPER_CYC", @@ -458,5 +488,20 @@ "EventCode": "0x400F8", "EventName": "PM_FLUSH", "BriefDescription": "Flush (any type)." + }, + { + "EventCode": "0x0B0000016080", + "EventName": "PM_L2_TLBIE_SLBIE_START", + "BriefDescription": "NCU Master received a TLBIE/SLBIEG/SLBIAG operation from the core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." + }, + { + "EventCode": "0x0B0000016880", + "EventName": "PM_L2_TLBIE_SLBIE_DELAY", + "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG command was held in a hottemp condition by the NCU Master. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_TLBIE_SLBIE_SENT to obtain the average time a TLBIE/SLBIEG/SLBIAG command was held. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." + }, + { + "EventCode": "0x0B0000026880", + "EventName": "PM_L2_SNP_TLBIE_SLBIE_DELAY", + "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG that targets this thread's LPAR was in flight while in a hottemp condition. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_SNP_TLBIE_SLBIE_START to obtain the overall efficiency. Note: 'inflight' means SnpTLB has been sent to core(ie doesn't include when SnpTLB is in NCU waiting to be launched serially behind different SnpTLB). The NCU Snooper gets in a 'hottemp' delay window when it detects it is above its TLBIE/SLBIE threshold for process SnpTLBIE/SLBIE with this core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json index 04732698d9b2..6f5b0e8fde12 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json @@ -129,6 +129,11 @@ "EventName": "PM_PMC6_OVERFLOW", "BriefDescription": "The event selected for PMC6 caused the event counter to overflow." }, + { + "EventCode": "0x3405A", + "EventName": "PM_PRIVILEGED_INST_CMPL", + "BriefDescription": "PowerPC instruction completed while the thread was in Privileged state." 
+ }, { "EventCode": "0x3006C", "EventName": "PM_RUN_CYC_SMT2_MODE", -- cgit v1.2.3 From adf50a6e66ae7527f9db4f5c9c11e6cc2a870d4f Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 27 Aug 2024 11:02:06 +0530 Subject: perf vendor events: Move PM_BR_MPRED_CMPL event for power10 platform Move PM_BR_MPRED_CMPL event from cache.json to frontend.json file for power10 platform Reviewed-by: Ian Rogers Signed-off-by: Kajol Jain Cc: Athira Rajeev Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20240827053206.538814-3-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/powerpc/power10/cache.json | 5 ----- tools/perf/pmu-events/arch/powerpc/power10/frontend.json | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json index 9814a59fce31..b7e0be09ff57 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/cache.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/cache.json @@ -18,10 +18,5 @@ "EventCode": "0x300F4", "EventName": "PM_RUN_INST_CMPL_CONC", "BriefDescription": "PowerPC instruction completed by this thread when all threads in the core had the run-latch set." - }, - { - "EventCode": "0x400F6", - "EventName": "PM_BR_MPRED_CMPL", - "BriefDescription": "A mispredicted branch completed. Includes direction and target." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index 684374fe5699..b6998987ab75 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -94,6 +94,11 @@ "EventName": "PM_DTLB_HIT_1G", "BriefDescription": "Data TLB hit (DERAT reload) page size 1G. Implies radix translation. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, + { + "EventCode": "0x400F6", + "EventName": "PM_BR_MPRED_CMPL", + "BriefDescription": "A mispredicted branch completed. Includes direction and target." + }, { "EventCode": "0x400FC", "EventName": "PM_ITLB_MISS", -- cgit v1.2.3 From aea4d463459c95a1b19627d1aa9c21b691da843a Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 1 Jul 2024 19:57:35 +0200 Subject: perf vendor events arm64: Move Yitian 710 DDR PMU into T-Head directory The Yitian 710 is not a Freescale/NXP design and thus should be located in a separate T-Head vendor directory. 
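Since the JSON payload is unchanged by the move, existing event and metric names keep resolving from the new location. A hypothetical invocation, assuming a machine that exposes the ali_drw_pmu driver (the metric name comes from the relocated metrics.json; availability depends on the system):

	# System-wide DDR read bandwidth for one second, via the moved metric
	$ perf stat -M ddr_read_bandwidth.all -a -- sleep 1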
Reviewed-by: Jing Zhang Signed-off-by: Lucas Stach Cc: Ingo Molnar Cc: James Clark Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shuai Xue Cc: Will Deacon Cc: kernel@pengutronix.de Cc: linux-arm-kernel@lists.infradead.org Cc: patchwork-lst@pengutronix.de Link: https://lore.kernel.org/r/20240701175735.485655-1-l.stach@pengutronix.de Signed-off-by: Arnaldo Carvalho de Melo --- .../arm64/freescale/yitian710/sys/ali_drw.json | 373 --------------------- .../arm64/freescale/yitian710/sys/metrics.json | 20 -- .../arch/arm64/thead/yitian710/sys/ali_drw.json | 373 +++++++++++++++++++++ .../arch/arm64/thead/yitian710/sys/metrics.json | 20 ++ 4 files changed, 393 insertions(+), 393 deletions(-) delete mode 100644 tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/ali_drw.json delete mode 100644 tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/metrics.json create mode 100644 tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/ali_drw.json create mode 100644 tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/metrics.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/ali_drw.json b/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/ali_drw.json deleted file mode 100644 index e21c469a8ef0..000000000000 --- a/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/ali_drw.json +++ /dev/null @@ -1,373 +0,0 @@ -[ - { - "BriefDescription": "A Write or Read Op at HIF interface. The unit is 64B.", - "ConfigCode": "0x0", - "EventName": "hif_rd_or_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Write Op at HIF interface. The unit is 64B.", - "ConfigCode": "0x1", - "EventName": "hif_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read Op at HIF interface. The unit is 64B.", - "ConfigCode": "0x2", - "EventName": "hif_rd", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read-Modify-Write Op at HIF interface. The unit is 64B.", - "ConfigCode": "0x3", - "EventName": "hif_rmw", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A high priority Read at HIF interface. 
The unit is 64B.", - "ConfigCode": "0x4", - "EventName": "hif_hi_pri_rd", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A write data cycle at DFI interface (to DRAM).", - "ConfigCode": "0x7", - "EventName": "dfi_wr_data_cycles", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A read data cycle at DFI interface (to DRAM).", - "ConfigCode": "0x8", - "EventName": "dfi_rd_data_cycles", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A high priority read becomes critical.", - "ConfigCode": "0x9", - "EventName": "hpr_xact_when_critical", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A low priority read becomes critical.", - "ConfigCode": "0xA", - "EventName": "lpr_xact_when_critical", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A write becomes critical.", - "ConfigCode": "0xB", - "EventName": "wr_xact_when_critical", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "An Activate(ACT) command to DRAM.", - "ConfigCode": "0xC", - "EventName": "op_is_activate", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read or Write CAS command to DRAM.", - "ConfigCode": "0xD", - "EventName": "op_is_rd_or_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "An Activate(ACT) command for read to DRAM.", - "ConfigCode": "0xE", - "EventName": "op_is_rd_activate", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read CAS command to DRAM.", - "ConfigCode": "0xF", - "EventName": "op_is_rd", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Write CAS command to DRAM.", - "ConfigCode": "0x10", - "EventName": "op_is_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Masked Write command to DRAM.", - "ConfigCode": "0x11", - "EventName": "op_is_mwr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Precharge(PRE) command to DRAM.", - "ConfigCode": "0x12", - "EventName": "op_is_precharge", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Precharge(PRE) required by read or write.", - "ConfigCode": "0x13", - "EventName": "precharge_for_rdwr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Precharge(PRE) required by other conditions.", - "ConfigCode": "0x14", - "EventName": "precharge_for_other", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A read-write turnaround.", - "ConfigCode": "0x15", - "EventName": "rdwr_transitions", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A write combine(merge) in write data buffer.", - "ConfigCode": "0x16", - "EventName": "write_combine", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Write-After-Read hazard.", - "ConfigCode": "0x17", - "EventName": "war_hazard", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read-After-Write hazard.", - "ConfigCode": "0x18", - "EventName": "raw_hazard", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Write-After-Write hazard.", - "ConfigCode": "0x19", - "EventName": "waw_hazard", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank0 enters self-refresh(SRE).", - "ConfigCode": "0x1A", - "EventName": 
"op_is_enter_selfref_rk0", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank1 enters self-refresh(SRE).", - "ConfigCode": "0x1B", - "EventName": "op_is_enter_selfref_rk1", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank2 enters self-refresh(SRE).", - "ConfigCode": "0x1C", - "EventName": "op_is_enter_selfref_rk2", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank3 enters self-refresh(SRE).", - "ConfigCode": "0x1D", - "EventName": "op_is_enter_selfref_rk3", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank0 enters power-down(PDE).", - "ConfigCode": "0x1E", - "EventName": "op_is_enter_powerdown_rk0", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank1 enters power-down(PDE).", - "ConfigCode": "0x1F", - "EventName": "op_is_enter_powerdown_rk1", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank2 enters power-down(PDE).", - "ConfigCode": "0x20", - "EventName": "op_is_enter_powerdown_rk2", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank3 enters power-down(PDE).", - "ConfigCode": "0x21", - "EventName": "op_is_enter_powerdown_rk3", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A cycle that Rank0 stays in self-refresh mode.", - "ConfigCode": "0x26", - "EventName": "selfref_mode_rk0", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A cycle that Rank1 stays in self-refresh mode.", - "ConfigCode": "0x27", - "EventName": "selfref_mode_rk1", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A cycle that Rank2 stays in self-refresh mode.", - "ConfigCode": "0x28", - "EventName": "selfref_mode_rk2", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A cycle that Rank3 stays in self-refresh mode.", - "ConfigCode": "0x29", - "EventName": "selfref_mode_rk3", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "An auto-refresh(REF) command to DRAM.", - "ConfigCode": "0x2A", - "EventName": "op_is_refresh", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A critical auto-refresh(REF) command to DRAM.", - "ConfigCode": "0x2B", - "EventName": "op_is_crit_ref", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "An MRR or MRW command to DRAM.", - "ConfigCode": "0x2D", - "EventName": "op_is_load_mode", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A ZQCal command to DRAM.", - "ConfigCode": "0x2E", - "EventName": "op_is_zqcl", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "At least one entry in read queue reaches the visible window limit.", - "ConfigCode": "0x30", - "EventName": "visible_window_limit_reached_rd", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "At least one entry in write queue reaches the visible window limit.", - "ConfigCode": "0x31", - "EventName": "visible_window_limit_reached_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A DQS Oscillator MPC command to DRAM.", - "ConfigCode": "0x34", - "EventName": "op_is_dqsosc_mpc", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A DQS Oscillator MRR command to DRAM.", - "ConfigCode": "0x35", - "EventName": "op_is_dqsosc_mrr", - "Unit": "ali_drw", - "Compat": 
"ali_drw_pmu" - }, - { - "BriefDescription": "A Temperature Compensated Refresh(TCR) MRR command to DRAM.", - "ConfigCode": "0x36", - "EventName": "op_is_tcr_mrr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A ZQCal Start command to DRAM.", - "ConfigCode": "0x37", - "EventName": "op_is_zqstart", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A ZQCal Latch command to DRAM.", - "ConfigCode": "0x38", - "EventName": "op_is_zqlatch", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A packet at CHI TXREQ interface (request).", - "ConfigCode": "0x39", - "EventName": "chi_txreq", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A packet at CHI TXDAT interface (read data).", - "ConfigCode": "0x3A", - "EventName": "chi_txdat", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A packet at CHI RXDAT interface (write data).", - "ConfigCode": "0x3B", - "EventName": "chi_rxdat", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A packet at CHI RXRSP interface.", - "ConfigCode": "0x3C", - "EventName": "chi_rxrsp", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A violation detected in TZC.", - "ConfigCode": "0x3D", - "EventName": "tsz_vio", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "The ddr cycles.", - "ConfigCode": "0x80", - "EventName": "ddr_cycles", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/metrics.json b/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/metrics.json deleted file mode 100644 index bc865b374b6a..000000000000 --- a/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/metrics.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "MetricName": "ddr_read_bandwidth.all", - "BriefDescription": "The ddr read bandwidth(MB/s).", - "MetricGroup": "ali_drw", - "MetricExpr": "hif_rd * 64 / 1e6 / duration_time", - "ScaleUnit": "1MB/s", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "MetricName": "ddr_write_bandwidth.all", - "BriefDescription": "The ddr write bandwidth(MB/s).", - "MetricGroup": "ali_drw", - "MetricExpr": "(hif_wr + hif_rmw) * 64 / 1e6 / duration_time", - "ScaleUnit": "1MB/s", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/ali_drw.json b/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/ali_drw.json new file mode 100644 index 000000000000..e21c469a8ef0 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/ali_drw.json @@ -0,0 +1,373 @@ +[ + { + "BriefDescription": "A Write or Read Op at HIF interface. The unit is 64B.", + "ConfigCode": "0x0", + "EventName": "hif_rd_or_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Write Op at HIF interface. The unit is 64B.", + "ConfigCode": "0x1", + "EventName": "hif_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read Op at HIF interface. The unit is 64B.", + "ConfigCode": "0x2", + "EventName": "hif_rd", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read-Modify-Write Op at HIF interface. The unit is 64B.", + "ConfigCode": "0x3", + "EventName": "hif_rmw", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A high priority Read at HIF interface. 
The unit is 64B.", + "ConfigCode": "0x4", + "EventName": "hif_hi_pri_rd", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A write data cycle at DFI interface (to DRAM).", + "ConfigCode": "0x7", + "EventName": "dfi_wr_data_cycles", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A read data cycle at DFI interface (to DRAM).", + "ConfigCode": "0x8", + "EventName": "dfi_rd_data_cycles", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A high priority read becomes critical.", + "ConfigCode": "0x9", + "EventName": "hpr_xact_when_critical", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A low priority read becomes critical.", + "ConfigCode": "0xA", + "EventName": "lpr_xact_when_critical", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A write becomes critical.", + "ConfigCode": "0xB", + "EventName": "wr_xact_when_critical", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "An Activate(ACT) command to DRAM.", + "ConfigCode": "0xC", + "EventName": "op_is_activate", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read or Write CAS command to DRAM.", + "ConfigCode": "0xD", + "EventName": "op_is_rd_or_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "An Activate(ACT) command for read to DRAM.", + "ConfigCode": "0xE", + "EventName": "op_is_rd_activate", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read CAS command to DRAM.", + "ConfigCode": "0xF", + "EventName": "op_is_rd", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Write CAS command to DRAM.", + "ConfigCode": "0x10", + "EventName": "op_is_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Masked Write command to DRAM.", + "ConfigCode": "0x11", + "EventName": "op_is_mwr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Precharge(PRE) command to DRAM.", + "ConfigCode": "0x12", + "EventName": "op_is_precharge", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Precharge(PRE) required by read or write.", + "ConfigCode": "0x13", + "EventName": "precharge_for_rdwr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Precharge(PRE) required by other conditions.", + "ConfigCode": "0x14", + "EventName": "precharge_for_other", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A read-write turnaround.", + "ConfigCode": "0x15", + "EventName": "rdwr_transitions", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A write combine(merge) in write data buffer.", + "ConfigCode": "0x16", + "EventName": "write_combine", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Write-After-Read hazard.", + "ConfigCode": "0x17", + "EventName": "war_hazard", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read-After-Write hazard.", + "ConfigCode": "0x18", + "EventName": "raw_hazard", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Write-After-Write hazard.", + "ConfigCode": "0x19", + "EventName": "waw_hazard", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank0 enters self-refresh(SRE).", + "ConfigCode": "0x1A", + "EventName": 
"op_is_enter_selfref_rk0", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank1 enters self-refresh(SRE).", + "ConfigCode": "0x1B", + "EventName": "op_is_enter_selfref_rk1", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank2 enters self-refresh(SRE).", + "ConfigCode": "0x1C", + "EventName": "op_is_enter_selfref_rk2", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank3 enters self-refresh(SRE).", + "ConfigCode": "0x1D", + "EventName": "op_is_enter_selfref_rk3", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank0 enters power-down(PDE).", + "ConfigCode": "0x1E", + "EventName": "op_is_enter_powerdown_rk0", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank1 enters power-down(PDE).", + "ConfigCode": "0x1F", + "EventName": "op_is_enter_powerdown_rk1", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank2 enters power-down(PDE).", + "ConfigCode": "0x20", + "EventName": "op_is_enter_powerdown_rk2", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank3 enters power-down(PDE).", + "ConfigCode": "0x21", + "EventName": "op_is_enter_powerdown_rk3", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A cycle that Rank0 stays in self-refresh mode.", + "ConfigCode": "0x26", + "EventName": "selfref_mode_rk0", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A cycle that Rank1 stays in self-refresh mode.", + "ConfigCode": "0x27", + "EventName": "selfref_mode_rk1", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A cycle that Rank2 stays in self-refresh mode.", + "ConfigCode": "0x28", + "EventName": "selfref_mode_rk2", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A cycle that Rank3 stays in self-refresh mode.", + "ConfigCode": "0x29", + "EventName": "selfref_mode_rk3", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "An auto-refresh(REF) command to DRAM.", + "ConfigCode": "0x2A", + "EventName": "op_is_refresh", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A critical auto-refresh(REF) command to DRAM.", + "ConfigCode": "0x2B", + "EventName": "op_is_crit_ref", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "An MRR or MRW command to DRAM.", + "ConfigCode": "0x2D", + "EventName": "op_is_load_mode", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A ZQCal command to DRAM.", + "ConfigCode": "0x2E", + "EventName": "op_is_zqcl", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "At least one entry in read queue reaches the visible window limit.", + "ConfigCode": "0x30", + "EventName": "visible_window_limit_reached_rd", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "At least one entry in write queue reaches the visible window limit.", + "ConfigCode": "0x31", + "EventName": "visible_window_limit_reached_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A DQS Oscillator MPC command to DRAM.", + "ConfigCode": "0x34", + "EventName": "op_is_dqsosc_mpc", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A DQS Oscillator MRR command to DRAM.", + "ConfigCode": "0x35", + "EventName": "op_is_dqsosc_mrr", + "Unit": "ali_drw", + "Compat": 
"ali_drw_pmu" + }, + { + "BriefDescription": "A Temperature Compensated Refresh(TCR) MRR command to DRAM.", + "ConfigCode": "0x36", + "EventName": "op_is_tcr_mrr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A ZQCal Start command to DRAM.", + "ConfigCode": "0x37", + "EventName": "op_is_zqstart", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A ZQCal Latch command to DRAM.", + "ConfigCode": "0x38", + "EventName": "op_is_zqlatch", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A packet at CHI TXREQ interface (request).", + "ConfigCode": "0x39", + "EventName": "chi_txreq", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A packet at CHI TXDAT interface (read data).", + "ConfigCode": "0x3A", + "EventName": "chi_txdat", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A packet at CHI RXDAT interface (write data).", + "ConfigCode": "0x3B", + "EventName": "chi_rxdat", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A packet at CHI RXRSP interface.", + "ConfigCode": "0x3C", + "EventName": "chi_rxrsp", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A violation detected in TZC.", + "ConfigCode": "0x3D", + "EventName": "tsz_vio", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "The ddr cycles.", + "ConfigCode": "0x80", + "EventName": "ddr_cycles", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/metrics.json b/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/metrics.json new file mode 100644 index 000000000000..bc865b374b6a --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/metrics.json @@ -0,0 +1,20 @@ +[ + { + "MetricName": "ddr_read_bandwidth.all", + "BriefDescription": "The ddr read bandwidth(MB/s).", + "MetricGroup": "ali_drw", + "MetricExpr": "hif_rd * 64 / 1e6 / duration_time", + "ScaleUnit": "1MB/s", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "MetricName": "ddr_write_bandwidth.all", + "BriefDescription": "The ddr write bandwidth(MB/s).", + "MetricGroup": "ali_drw", + "MetricExpr": "(hif_wr + hif_rmw) * 64 / 1e6 / duration_time", + "ScaleUnit": "1MB/s", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + } +] -- cgit v1.2.3 From c87826ddcefaf0b1245d1d8c61040679ea0bd387 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 6 Aug 2024 21:41:22 +0100 Subject: perf auxtrace: Use evsel__is_aux_event() for checking AUX event Use evsel__is_aux_event() to decide if an event is a AUX event, this is a refactoring to replace comparing the PMU type. 
Reviewed-by: Adrian Hunter Signed-off-by: Leo Yan Cc: Ian Rogers Cc: James Clark Cc: Kan Liang Cc: Mike Leach Cc: Namhyung Kim Cc: Suzuki Poulouse Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240806204130.720977-2-leo.yan@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index cbb773ed6f1a..ca8682966fae 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -671,11 +671,11 @@ int auxtrace_record__read_finish(struct auxtrace_record *itr, int idx) { struct evsel *evsel; - if (!itr->evlist || !itr->pmu) + if (!itr->evlist) return -EINVAL; evlist__for_each_entry(itr->evlist, evsel) { - if (evsel->core.attr.type == itr->pmu->type) { + if (evsel__is_aux_event(evsel)) { if (evsel->disabled) return 0; return evlist__enable_event_idx(itr->evlist, evsel, idx); -- cgit v1.2.3 From d5726f1c8d421f000a9265601c28279e82d43db0 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 6 Aug 2024 21:41:23 +0100 Subject: perf auxtrace: Remove unused 'pmu' pointer from struct auxtrace_record The 'pmu' pointer in the auxtrace_record structure is no longer used since support for multiple AUX events was added, so remove it. Reviewed-by: Adrian Hunter Signed-off-by: Leo Yan Cc: Ian Rogers Cc: James Clark Cc: Kan Liang Cc: Mike Leach Cc: Namhyung Kim Cc: Suzuki Poulouse Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240806204130.720977-3-leo.yan@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 1 - tools/perf/arch/arm64/util/arm-spe.c | 1 - tools/perf/arch/arm64/util/hisi-ptt.c | 1 - tools/perf/arch/x86/util/intel-bts.c | 1 - tools/perf/arch/x86/util/intel-pt.c | 1 - tools/perf/util/auxtrace.h | 1 - 6 files changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index da6231367993..96aeb7cdbee1 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -888,7 +888,6 @@ struct auxtrace_record *cs_etm_record_init(int *err) } ptr->cs_etm_pmu = cs_etm_pmu; - ptr->itr.pmu = cs_etm_pmu; ptr->itr.parse_snapshot_options = cs_etm_parse_snapshot_options; ptr->itr.recording_options = cs_etm_recording_options; ptr->itr.info_priv_size = cs_etm_info_priv_size; diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index d59f6ca499f2..2be99fdf997d 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -514,7 +514,6 @@ struct auxtrace_record *arm_spe_recording_init(int *err, } sper->arm_spe_pmu = arm_spe_pmu; - sper->itr.pmu = arm_spe_pmu; sper->itr.snapshot_start = arm_spe_snapshot_start; sper->itr.snapshot_finish = arm_spe_snapshot_finish; sper->itr.find_snapshot = arm_spe_find_snapshot; diff --git a/tools/perf/arch/arm64/util/hisi-ptt.c b/tools/perf/arch/arm64/util/hisi-ptt.c index ba97c8a562a0..eac9739c87e6 100644 --- a/tools/perf/arch/arm64/util/hisi-ptt.c +++ b/tools/perf/arch/arm64/util/hisi-ptt.c @@ -174,7 +174,6 @@ struct auxtrace_record *hisi_ptt_recording_init(int *err, } pttr->hisi_ptt_pmu = hisi_ptt_pmu; - pttr->itr.pmu = hisi_ptt_pmu; pttr->itr.recording_options = hisi_ptt_recording_options; pttr->itr.info_priv_size = hisi_ptt_info_priv_size; pttr->itr.info_fill = hisi_ptt_info_fill; diff --git 
a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 34696f3d3d5d..85c8186300c8 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -434,7 +434,6 @@ struct auxtrace_record *intel_bts_recording_init(int *err) } btsr->intel_bts_pmu = intel_bts_pmu; - btsr->itr.pmu = intel_bts_pmu; btsr->itr.recording_options = intel_bts_recording_options; btsr->itr.info_priv_size = intel_bts_info_priv_size; btsr->itr.info_fill = intel_bts_info_fill; diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 4b710e875953..ea510a7486b1 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -1197,7 +1197,6 @@ struct auxtrace_record *intel_pt_recording_init(int *err) } ptr->intel_pt_pmu = intel_pt_pmu; - ptr->itr.pmu = intel_pt_pmu; ptr->itr.recording_options = intel_pt_recording_options; ptr->itr.info_priv_size = intel_pt_info_priv_size; ptr->itr.info_fill = intel_pt_info_fill; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index d405efcd8708..a1895a4f530b 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -411,7 +411,6 @@ struct auxtrace_record { int (*read_finish)(struct auxtrace_record *itr, int idx); unsigned int alignment; unsigned int default_aux_sample_size; - struct perf_pmu *pmu; struct evlist *evlist; }; -- cgit v1.2.3 From 6f87543c74ddc9649cfd9a6e6908821cdb4d397a Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 9 Aug 2024 10:54:22 +0100 Subject: perf test trace_btf_enum: Fix shellcheck warning Shellcheck versions < v0.7.2 can't follow this path, so add a 'shellcheck source' directive to fix the following warning: In tests/shell/trace_btf_enum.sh line 13: . "$(dirname $0)"/lib/probe.sh ^--------------------------^ SC1090: Can't follow non-constant source. Use a directive to specify location. Fixes: d66763fed30f0bd8 ("perf test trace_btf_enum: Add regression test for the BTF augmentation of enums in 'perf trace'") Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240809095426.3065163-1-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/trace_btf_enum.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/trace_btf_enum.sh b/tools/perf/tests/shell/trace_btf_enum.sh index 7d407b52bea5..5a3b8a5a9b5c 100755 --- a/tools/perf/tests/shell/trace_btf_enum.sh +++ b/tools/perf/tests/shell/trace_btf_enum.sh @@ -10,6 +10,7 @@ non_syscall="timer:hrtimer_init,timer:hrtimer_start" TESTPROG="perf test -w landlock" +# shellcheck source=lib/probe.sh . "$(dirname $0)"/lib/probe.sh skip_if_no_perf_trace || exit 2 -- cgit v1.2.3 From 9af2efee41b27a0f386fb5aa95d8d0b4b5d9fede Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Aug 2024 15:10:42 -0700 Subject: perf report: Fix segfault when 'sym' sort key is not used The fields in the hist_entry are filled on demand, which means they only have meaningful values when the relevant sort keys are used. So if neither the 'dso' nor the 'sym' sort key is used, the map/symbols in the hist entry can be garbage and shouldn't be accessed unconditionally. I got a segfault when I wanted to see cgroup profiles. $ sudo perf record -a --all-cgroups --synth=cgroup true $ sudo perf report -s cgroup Program received signal SIGSEGV, Segmentation fault. 
0x00005555557a8d90 in map__dso (map=0x0) at util/map.h:48 48 return RC_CHK_ACCESS(map)->dso; (gdb) bt #0 0x00005555557a8d90 in map__dso (map=0x0) at util/map.h:48 #1 0x00005555557aa39b in map__load (map=0x0) at util/map.c:344 #2 0x00005555557aa592 in map__find_symbol (map=0x0, addr=140736115941088) at util/map.c:385 #3 0x00005555557ef000 in hists__findnew_entry (hists=0x555556039d60, entry=0x7fffffffa4c0, al=0x7fffffffa8c0, sample_self=true) at util/hist.c:644 #4 0x00005555557ef61c in __hists__add_entry (hists=0x555556039d60, al=0x7fffffffa8c0, sym_parent=0x0, bi=0x0, mi=0x0, ki=0x0, block_info=0x0, sample=0x7fffffffaa90, sample_self=true, ops=0x0) at util/hist.c:761 #5 0x00005555557ef71f in hists__add_entry (hists=0x555556039d60, al=0x7fffffffa8c0, sym_parent=0x0, bi=0x0, mi=0x0, ki=0x0, sample=0x7fffffffaa90, sample_self=true) at util/hist.c:779 #6 0x00005555557f00fb in iter_add_single_normal_entry (iter=0x7fffffffa900, al=0x7fffffffa8c0) at util/hist.c:1015 #7 0x00005555557f09a7 in hist_entry_iter__add (iter=0x7fffffffa900, al=0x7fffffffa8c0, max_stack_depth=127, arg=0x7fffffffbce0) at util/hist.c:1260 #8 0x00005555555ba7ce in process_sample_event (tool=0x7fffffffbce0, event=0x7ffff7c14128, sample=0x7fffffffaa90, evsel=0x555556039ad0, machine=0x5555560388e8) at builtin-report.c:334 #9 0x00005555557b30c8 in evlist__deliver_sample (evlist=0x555556039010, tool=0x7fffffffbce0, event=0x7ffff7c14128, sample=0x7fffffffaa90, evsel=0x555556039ad0, machine=0x5555560388e8) at util/session.c:1232 #10 0x00005555557b32bc in machines__deliver_event (machines=0x5555560388e8, evlist=0x555556039010, event=0x7ffff7c14128, sample=0x7fffffffaa90, tool=0x7fffffffbce0, file_offset=110888, file_path=0x555556038ff0 "perf.data") at util/session.c:1271 #11 0x00005555557b3848 in perf_session__deliver_event (session=0x5555560386d0, event=0x7ffff7c14128, tool=0x7fffffffbce0, file_offset=110888, file_path=0x555556038ff0 "perf.data") at util/session.c:1354 #12 0x00005555557affaf in ordered_events__deliver_event (oe=0x555556038e60, event=0x555556135aa0) at util/session.c:132 #13 0x00005555557bb605 in do_flush (oe=0x555556038e60, show_progress=false) at util/ordered-events.c:245 #14 0x00005555557bb95c in __ordered_events__flush (oe=0x555556038e60, how=OE_FLUSH__ROUND, timestamp=0) at util/ordered-events.c:324 #15 0x00005555557bba46 in ordered_events__flush (oe=0x555556038e60, how=OE_FLUSH__ROUND) at util/ordered-events.c:342 #16 0x00005555557b1b3b in perf_event__process_finished_round (tool=0x7fffffffbce0, event=0x7ffff7c15bb8, oe=0x555556038e60) at util/session.c:780 #17 0x00005555557b3b27 in perf_session__process_user_event (session=0x5555560386d0, event=0x7ffff7c15bb8, file_offset=117688, file_path=0x555556038ff0 "perf.data") at util/session.c:1406 As you can see, the entry->ms.map was NULL even though he->ms.map had a value. This is because the 'sym' sort key is not given, so we cannot assume that he->ms.sym and entry->ms.sym are the same. I only checked the 'sym' sort key here as it implies 'dso' behavior (so the maps are the same). 
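The invariant behind the fix can be shown with a stand-alone sketch; the types below are hypothetical illustrations, not perf's, and they mirror the hists__has(hists, sym) guard added in the diff: an on-demand field may only be dereferenced when the matching sort key says it was populated.

/* Stand-alone sketch (hypothetical types): on-demand fields are only
 * valid when the matching sort-key bit is set. */
#include <stdio.h>

enum sort_key { SORT_SYM = 1 << 0, SORT_CGROUP = 1 << 1 };

struct entry {
        unsigned int keys;      /* which sort keys filled this entry */
        const char *sym;        /* meaningful only when SORT_SYM is set */
};

static const char *entry_sym(const struct entry *e)
{
        if (!(e->keys & SORT_SYM))      /* field was never populated */
                return "(not collected)";
        return e->sym ? e->sym : "(unknown)";
}

int main(void)
{
        struct entry cgroup_only = { .keys = SORT_CGROUP, .sym = NULL };

        puts(entry_sym(&cgroup_only)); /* guarded: no NULL dereference */
        return 0;
}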
Fixes: ac01c8c4246546fd ("perf hist: Update hist symbol when updating maps") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240826221045.1202305-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index dadce2889e52..f387e85a0087 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -638,7 +638,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, * mis-adjust symbol addresses when computing * the history counter to increment. */ - if (he->ms.map != entry->ms.map) { + if (hists__has(hists, sym) && he->ms.map != entry->ms.map) { if (he->ms.sym) { u64 addr = he->ms.sym->start; he->ms.sym = map__find_symbol(entry->ms.map, addr); -- cgit v1.2.3 From 591156f25f6bc71b143d4a7c7cbe7cdb68dda94e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Aug 2024 15:10:43 -0700 Subject: perf bpf-filter: Add build dependency to header files The flex and bison files need to be recompiled when one of these header files is changed. * util/bpf-filter.h * util/bpf_skel/sample-filter.h Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240826221045.1202305-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index b87f918bdfe7..260cec2f6c0b 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -278,12 +278,12 @@ $(OUTPUT)util/pmu-bison.c $(OUTPUT)util/pmu-bison.h: util/pmu.y $(Q)$(call echo-cmd,bison)$(BISON) -v $< -d $(PARSER_DEBUG_BISON) $(BISON_FILE_PREFIX_MAP) \ -o $(OUTPUT)util/pmu-bison.c -p perf_pmu_ -$(OUTPUT)util/bpf-filter-flex.c $(OUTPUT)util/bpf-filter-flex.h: util/bpf-filter.l $(OUTPUT)util/bpf-filter-bison.c +$(OUTPUT)util/bpf-filter-flex.c $(OUTPUT)util/bpf-filter-flex.h: util/bpf-filter.l $(OUTPUT)util/bpf-filter-bison.c util/bpf-filter.h util/bpf_skel/sample-filter.h $(call rule_mkdir) $(Q)$(call echo-cmd,flex)$(FLEX) -o $(OUTPUT)util/bpf-filter-flex.c \ --header-file=$(OUTPUT)util/bpf-filter-flex.h $(PARSER_DEBUG_FLEX) $< -$(OUTPUT)util/bpf-filter-bison.c $(OUTPUT)util/bpf-filter-bison.h: util/bpf-filter.y +$(OUTPUT)util/bpf-filter-bison.c $(OUTPUT)util/bpf-filter-bison.h: util/bpf-filter.y util/bpf-filter.h util/bpf_skel/sample-filter.h $(call rule_mkdir) $(Q)$(call echo-cmd,bison)$(BISON) -v $< -d $(PARSER_DEBUG_BISON) $(BISON_FILE_PREFIX_MAP) \ -o $(OUTPUT)util/bpf-filter-bison.c -p perf_bpf_filter_ -- cgit v1.2.3 From 91e88437d5156b209b1d69b69b560f0a02b80712 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Aug 2024 15:10:44 -0700 Subject: perf bpf-filter: Support filtering on cgroups The new cgroup filter accepts either the '==' or '!=' operator and a pathname for the target cgroup. $ perf record -a --all-cgroups -e cycles --filter 'cgroup == /abc/def' -- sleep 1 Users should pass the --all-cgroups option on the command line to enable cgroup filtering. Technically it doesn't need the option, as it can get the current task's cgroup info directly from BPF. But I want to follow the convention for the other sample info.
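For background, a cgroup v2 path can be resolved to the 64-bit id the filter compares samples against using the file-handle trick below. This is an illustrative sketch only (perf itself goes through cgroup__new() and read_cgroup_id()), and the assumption that cgroup2 encodes the id in the file handle is called out in the comments:

/* Illustrative sketch: map a cgroup v2 path to its 64-bit id.
 * Assumption: cgroup2 stores the id in the name_to_handle_at()
 * file handle, as the kernel selftest helpers rely on. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "/sys/fs/cgroup";
        union {
                struct file_handle fh;
                char buf[sizeof(struct file_handle) + sizeof(uint64_t)];
        } u = { .fh.handle_bytes = sizeof(uint64_t) };
        uint64_t cgid;
        int mount_id;

        if (name_to_handle_at(AT_FDCWD, path, &u.fh, &mount_id, 0) < 0) {
                perror("name_to_handle_at");
                return 1;
        }
        memcpy(&cgid, u.fh.f_handle, sizeof(cgid));
        printf("cgroup id of %s: %llu\n", path, (unsigned long long)cgid);
        return 0;
}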
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240826221045.1202305-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 1 + tools/perf/util/bpf-filter.l | 28 +++++++++++++++++++++++----- tools/perf/util/bpf-filter.y | 28 ++++++++++++++++++++++++++-- tools/perf/util/bpf_skel/sample-filter.h | 2 +- tools/perf/util/bpf_skel/sample_filter.bpf.c | 4 +++- tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 1 + 6 files changed, 55 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 0a1832564dd2..e87b6789eb9e 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -100,6 +100,7 @@ static const struct perf_sample_info { PERF_SAMPLE_TYPE(TRANSACTION, "--transaction"), PERF_SAMPLE_TYPE(CODE_PAGE_SIZE, "--code-page-size"), PERF_SAMPLE_TYPE(DATA_PAGE_SIZE, "--data-page-size"), + PERF_SAMPLE_TYPE(CGROUP, "--all-cgroups"), }; static int get_pinned_fd(const char *name); diff --git a/tools/perf/util/bpf-filter.l b/tools/perf/util/bpf-filter.l index 2a7c839f3fae..f313404f95a9 100644 --- a/tools/perf/util/bpf-filter.l +++ b/tools/perf/util/bpf-filter.l @@ -9,8 +9,11 @@ #include "bpf-filter.h" #include "bpf-filter-bison.h" +extern int perf_bpf_filter_needs_path; + static int sample(enum perf_bpf_filter_term term) { + perf_bpf_filter_needs_path = 0; perf_bpf_filter_lval.sample.term = term; perf_bpf_filter_lval.sample.part = 0; return BFT_SAMPLE; @@ -18,11 +21,20 @@ static int sample(enum perf_bpf_filter_term term) static int sample_part(enum perf_bpf_filter_term term, int part) { + perf_bpf_filter_needs_path = 0; perf_bpf_filter_lval.sample.term = term; perf_bpf_filter_lval.sample.part = part; return BFT_SAMPLE; } +static int sample_path(enum perf_bpf_filter_term term) +{ + perf_bpf_filter_needs_path = 1; + perf_bpf_filter_lval.sample.term = term; + perf_bpf_filter_lval.sample.part = 0; + return BFT_SAMPLE_PATH; +} + static int operator(enum perf_bpf_filter_op op) { perf_bpf_filter_lval.op = op; @@ -48,10 +60,15 @@ static int constant(int val) return BFT_NUM; } -static int error(const char *str) +static int path_or_error(void) { - printf("perf_bpf_filter: Unexpected filter %s: %s\n", str, perf_bpf_filter_text); - return BFT_ERROR; + if (!perf_bpf_filter_needs_path) { + printf("perf_bpf_filter: Error: Unexpected item: %s\n", + perf_bpf_filter_text); + return BFT_ERROR; + } + perf_bpf_filter_lval.path = perf_bpf_filter_text; + return BFT_PATH; } %} @@ -59,6 +76,7 @@ static int error(const char *str) num_dec [0-9]+ num_hex 0[Xx][0-9a-fA-F]+ space [ \t]+ +path [^ \t\n]+ ident [_a-zA-Z][_a-zA-Z0-9]+ %% @@ -97,6 +115,7 @@ mem_blk { return sample_part(PBF_TERM_DATA_SRC, 7); } mem_hops { return sample_part(PBF_TERM_DATA_SRC, 8); } uid { return sample(PBF_TERM_UID); } gid { return sample(PBF_TERM_GID); } +cgroup { return sample_path(PBF_TERM_CGROUP); } "==" { return operator(PBF_OP_EQ); } "!=" { return operator(PBF_OP_NEQ); } @@ -155,7 +174,6 @@ hops3 { return constant(PERF_MEM_HOPS_3); } "," { return ','; } "||" { return BFT_LOGICAL_OR; } -{ident} { return error("ident"); } -. 
{ return error("input"); } +{path} { return path_or_error(); } %% diff --git a/tools/perf/util/bpf-filter.y b/tools/perf/util/bpf-filter.y index 0c56fccb8874..5a79a8e7a45b 100644 --- a/tools/perf/util/bpf-filter.y +++ b/tools/perf/util/bpf-filter.y @@ -12,9 +12,13 @@ #include #include #include "bpf-filter.h" +#include "cgroup.h" int perf_bpf_filter_lex(void); +/* To indicate if the current term needs a pathname or not */ +int perf_bpf_filter_needs_path; + static void perf_bpf_filter_error(struct list_head *expr __maybe_unused, char const *msg) { @@ -26,6 +30,7 @@ static void perf_bpf_filter_error(struct list_head *expr __maybe_unused, %union { unsigned long num; + char *path; struct { enum perf_bpf_filter_term term; int part; @@ -34,12 +39,13 @@ static void perf_bpf_filter_error(struct list_head *expr __maybe_unused, struct perf_bpf_filter_expr *expr; } -%token BFT_SAMPLE BFT_OP BFT_ERROR BFT_NUM BFT_LOGICAL_OR +%token BFT_SAMPLE BFT_SAMPLE_PATH BFT_OP BFT_ERROR BFT_NUM BFT_LOGICAL_OR BFT_PATH %type filter_term filter_expr %destructor { free ($$); } -%type BFT_SAMPLE +%type BFT_SAMPLE BFT_SAMPLE_PATH %type BFT_OP %type BFT_NUM +%type BFT_PATH %% @@ -81,5 +87,23 @@ BFT_SAMPLE BFT_OP BFT_NUM { $$ = perf_bpf_filter_expr__new($1.term, $1.part, $2, $3); } +| +BFT_SAMPLE_PATH BFT_OP BFT_PATH +{ + struct cgroup *cgrp; + unsigned long cgroup_id = 0; + + if ($2 != PBF_OP_EQ && $2 != PBF_OP_NEQ) { + printf("perf_bpf_filter: cgroup accepts '==' or '!=' only\n"); + YYERROR; + } + + cgrp = cgroup__new($3, /*do_open=*/false); + if (cgrp && read_cgroup_id(cgrp) == 0) + cgroup_id = cgrp->id; + + $$ = perf_bpf_filter_expr__new($1.term, $1.part, $2, cgroup_id); + cgroup__put(cgrp); +} %% diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h index 5f0c8e4e83d3..683fec85e71e 100644 --- a/tools/perf/util/bpf_skel/sample-filter.h +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -45,7 +45,7 @@ enum perf_bpf_filter_term { __PBF_UNUSED_TERM18 = PBF_TERM_SAMPLE_START + 18, /* SAMPLE_REGS_INTR = 1U << 18 */ PBF_TERM_PHYS_ADDR = PBF_TERM_SAMPLE_START + 19, /* SAMPLE_PHYS_ADDR = 1U << 19 */ __PBF_UNUSED_TERM20 = PBF_TERM_SAMPLE_START + 20, /* SAMPLE_AUX = 1U << 20 */ - __PBF_UNUSED_TERM21 = PBF_TERM_SAMPLE_START + 21, /* SAMPLE_CGROUP = 1U << 21 */ + PBF_TERM_CGROUP = PBF_TERM_SAMPLE_START + 21, /* SAMPLE_CGROUP = 1U << 21 */ PBF_TERM_DATA_PAGE_SIZE = PBF_TERM_SAMPLE_START + 22, /* SAMPLE_DATA_PAGE_SIZE = 1U << 22 */ PBF_TERM_CODE_PAGE_SIZE = PBF_TERM_SAMPLE_START + 23, /* SAMPLE_CODE_PAGE_SIZE = 1U << 23 */ PBF_TERM_WEIGHT_STRUCT = PBF_TERM_SAMPLE_START + 24, /* SAMPLE_WEIGHT_STRUCT = 1U << 24 */ diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index 4872a16eedfd..b195e6efeb8b 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -93,6 +93,7 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, BUILD_CHECK_SAMPLE(DATA_SRC); BUILD_CHECK_SAMPLE(TRANSACTION); BUILD_CHECK_SAMPLE(PHYS_ADDR); + BUILD_CHECK_SAMPLE(CGROUP); BUILD_CHECK_SAMPLE(DATA_PAGE_SIZE); BUILD_CHECK_SAMPLE(CODE_PAGE_SIZE); BUILD_CHECK_SAMPLE(WEIGHT_STRUCT); @@ -135,6 +136,8 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, return kctx->data->weight.full; case PBF_TERM_PHYS_ADDR: return kctx->data->phys_addr; + case PBF_TERM_CGROUP: + return kctx->data->cgroup; case PBF_TERM_CODE_PAGE_SIZE: return kctx->data->code_page_size; case 
PBF_TERM_DATA_PAGE_SIZE: @@ -183,7 +186,6 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, case __PBF_UNUSED_TERM16: case __PBF_UNUSED_TERM18: case __PBF_UNUSED_TERM20: - case __PBF_UNUSED_TERM21: default: break; } diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index 4fa21468487e..4dcad7b682bd 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -171,6 +171,7 @@ struct perf_sample_data { u32 cpu; } cpu_entry; u64 phys_addr; + u64 cgroup; u64 data_page_size; u64 code_page_size; } __attribute__((__aligned__(64))) __attribute__((preserve_access_index)); -- cgit v1.2.3 From d56a4d56a25c6aa76d816f1ee1888d38cf654004 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Aug 2024 15:10:45 -0700 Subject: perf test: Add 'perf record cgroup' filtering test $ sudo ./perf test filtering -vv 96: perf record sample filtering (by BPF) tests: --- start --- test child forked, pid 2966908 Checking BPF-filter privilege Basic bpf-filter test Basic bpf-filter test [Success] Failing bpf-filter test Failing bpf-filter test [Success] Group bpf-filter test Group bpf-filter test [Success] Multiple bpf-filter test Multiple bpf-filter test [Success] Cgroup bpf-filter test Cgroup bpf-filter test [Success] ---- end(0) ---- 96: perf record sample filtering (by BPF) tests : Ok Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240826221045.1202305-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_bpf_filter.sh | 39 ++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record_bpf_filter.sh b/tools/perf/tests/shell/record_bpf_filter.sh index 4df33c15bfa7..1b58ccc1fd88 100755 --- a/tools/perf/tests/shell/record_bpf_filter.sh +++ b/tools/perf/tests/shell/record_bpf_filter.sh @@ -68,7 +68,7 @@ test_bpf_filter_fail() { # 'cpu' requires PERF_SAMPLE_CPU flag if ! perf record -e task-clock --filter 'cpu > 0' \ - -o /dev/null true 2>&1 | grep PERF_SAMPLE_CPU + -o /dev/null true 2>&1 | grep -q PERF_SAMPLE_CPU then echo "Failing bpf-filter test [Failed forbidden CPU]" err=1 @@ -98,7 +98,7 @@ test_bpf_filter_group() { fi if ! perf record -e task-clock --filter 'cpu > 0 || ip > 0' \ - -o /dev/null true 2>&1 | grep PERF_SAMPLE_CPU + -o /dev/null true 2>&1 | grep -q PERF_SAMPLE_CPU then echo "Group bpf-filter test [Failed forbidden CPU]" err=1 @@ -106,7 +106,7 @@ test_bpf_filter_group() { fi if ! perf record -e task-clock --filter 'period > 0 || code_pgsz > 4096' \ - -o /dev/null true 2>&1 | grep PERF_SAMPLE_CODE_PAGE_SIZE + -o /dev/null true 2>&1 | grep -q PERF_SAMPLE_CODE_PAGE_SIZE then echo "Group bpf-filter test [Failed forbidden CODE_PAGE_SIZE]" err=1 @@ -147,6 +147,35 @@ test_bpf_filter_multi() { echo "Multiple bpf-filter test [Success]" } +test_bpf_filter_cgroup() { + echo "Cgroup bpf-filter test" + + if ! perf record -e task-clock --filter 'cgroup == /' \ + -a --all-cgroups --synth=cgroup -o "${perfdata}" true 2> /dev/null + then + echo "Cgroup bpf-filter test [Skipped cgroup not supported]" + return + fi + + # 'cgroup' requires PERF_SAMPLE_CGROUP flag + if ! 
perf record -e task-clock --filter 'cgroup == /' \ + -o /dev/null true 2>&1 | grep -q PERF_SAMPLE_CGROUP + then + echo "Cgroup bpf-filter test [Failed CGROUP requires --all-cgroups]" + err=1 + return + fi + + if ! perf report -i "${perfdata}" -s cgroup -q | grep -q -F '100.00%' + then + echo "Cgroup bpf-filter test [Failed root cgroup does not have 100%]" + err=1 + return + fi + + echo "Cgroup bpf-filter test [Success]" +} + test_bpf_filter_priv if [ $err = 0 ]; then @@ -165,5 +194,9 @@ if [ $err = 0 ]; then test_bpf_filter_multi fi +if [ $err = 0 ]; then + test_bpf_filter_cgroup +fi + cleanup exit $err -- cgit v1.2.3 From 47b3b6435e4bfb61ae8ffc63a11bd3c310f69acf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 28 Aug 2024 19:06:47 -0300 Subject: tools build: Remove leftover libcap tests that prevent fast path feature detection from working I noticed that the fast path feature detection was failing: $ cat /tmp/build/perf-tools-next/feature/test-all.make.output /usr/bin/ld: cannot find -lcap: No such file or directory collect2: error: ld returned 1 exit status $ The patch removing the dependency (Fixes tag below) didn't remove the detection of libcap, and as the fast path feature detection (test-all.c) had -lcap in its Makefile list of libraries to link, it was failing when libcap-devel is not available. Fix it by removing those leftover files. Fixes: e25ebda78e230283 ("perf cap: Tidy up and improve capability testing") Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Zs-gjOGFWtAvIZit@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/Makefile | 6 +----- tools/build/feature/test-libcap.c | 20 -------------------- 2 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 tools/build/feature/test-libcap.c (limited to 'tools') diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 12796808f07a..b873fc58804e 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -25,7 +25,6 @@ FILES= \ test-libbfd-liberty-z.bin \ test-cplus-demangle.bin \ test-cxa-demangle.bin \ - test-libcap.bin \ test-libelf.bin \ test-libelf-getphdrnum.bin \ test-libelf-gelf_getnote.bin \ @@ -112,7 +111,7 @@ all: $(FILES) __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS) BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1 BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl - BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd -lcap + BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS) BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1 @@ -140,9 +139,6 @@ $(OUTPUT)test-fortify-source.bin: $(OUTPUT)test-bionic.bin: $(BUILD) -$(OUTPUT)test-libcap.bin: - $(BUILD) -lcap - $(OUTPUT)test-libelf.bin: $(BUILD) -lelf diff --git a/tools/build/feature/test-libcap.c b/tools/build/feature/test-libcap.c deleted file mode 100644 index d2a2e152195f..000000000000 --- a/tools/build/feature/test-libcap.c +++ /dev/null @@ -1,20 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -int main(void) -{ - cap_flag_value_t val; - cap_t caps = cap_get_proc(); - - if (!caps) - 
return 1; - - if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val) != 0) - return 1; - - if (cap_free(caps) != 0) - return 1; - - return 0; -} -- cgit v1.2.3 From 6db59c4935c9354f4c33fc63287d110317ca4d70 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Wed, 28 Aug 2024 20:51:32 +0100 Subject: selftests/bpf: Add test for zero offset or non-zero offset pointers as KF_ACQUIRE kfunc arguments This patch adds test cases for zero offset (implicit cast) and non-zero offset pointers as KF_ACQUIRE kfunc arguments. Currently KF_ACQUIRE kfuncs should support passing in pointers like &sk->sk_write_queue (non-zero offset) or &sk->__sk_common (zero offset) and not be rejected by the verifier. Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB5848CB6F0D4D9068669A905B99952@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 17 +++++++++++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 4 +++ .../selftests/bpf/prog_tests/nested_trust.c | 4 +++ tools/testing/selftests/bpf/progs/nested_acquire.c | 33 ++++++++++++++++++++++ 4 files changed, 58 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/nested_acquire.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index c04b7dec2ab9..8a71a91b752d 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -183,6 +183,20 @@ __bpf_kfunc void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, { } +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) +{ + return NULL; +} + +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) +{ + return NULL; +} + +__bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) +{ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -541,6 +555,9 @@ BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) BTF_ID_FLAGS(func, bpf_kfunc_common_test) BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index e587a79f2239..c6c314965bb1 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -144,4 +144,8 @@ void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, struct bpf_dynptr *ptr__nulla struct bpf_testmod_ctx *bpf_testmod_ctx_create(int *err) __ksym; void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) __ksym; +struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) __ksym; +struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) __ksym; +void bpf_kfunc_nested_release_test(struct sk_buff *ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/nested_trust.c 
b/tools/testing/selftests/bpf/prog_tests/nested_trust.c index 39886f58924e..54a112ad5f9c 100644 --- a/tools/testing/selftests/bpf/prog_tests/nested_trust.c +++ b/tools/testing/selftests/bpf/prog_tests/nested_trust.c @@ -4,9 +4,13 @@ #include #include "nested_trust_failure.skel.h" #include "nested_trust_success.skel.h" +#include "nested_acquire.skel.h" void test_nested_trust(void) { RUN_TESTS(nested_trust_success); RUN_TESTS(nested_trust_failure); + + if (env.has_testmod) + RUN_TESTS(nested_acquire); } diff --git a/tools/testing/selftests/bpf/progs/nested_acquire.c b/tools/testing/selftests/bpf/progs/nested_acquire.c new file mode 100644 index 000000000000..8e521a21d995 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/nested_acquire.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +SEC("tp_btf/tcp_probe") +__success +int BPF_PROG(test_nested_acquire_nonzero, struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *ptr; + + ptr = bpf_kfunc_nested_acquire_nonzero_offset_test(&sk->sk_write_queue); + + bpf_kfunc_nested_release_test(ptr); + return 0; +} + +SEC("tp_btf/tcp_probe") +__success +int BPF_PROG(test_nested_acquire_zero, struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *ptr; + + ptr = bpf_kfunc_nested_acquire_zero_offset_test(&sk->__sk_common); + + bpf_kfunc_nested_release_test(ptr); + return 0; +} -- cgit v1.2.3 From 0a8b08c554dabea952a75363c89050b1fbcbfffb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Aug 2024 11:00:12 +0200 Subject: selftests: netfilter: nft_queue.sh: reduce test file size for debug build The sctp selftest is very slow on debug kernels. Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240826192500.32efa22c@kernel.org/ Fixes: 4e97d521c2be ("selftests: netfilter: nft_queue.sh: sctp coverage") Signed-off-by: Florian Westphal Acked-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20240827090023.8917-1-fw@strlen.de Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/netfilter/nft_queue.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index f3bdeb1271eb..9e5f423bff09 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -39,7 +39,9 @@ TMPFILE2=$(mktemp) TMPFILE3=$(mktemp) TMPINPUT=$(mktemp) -dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" +COUNT=200 +[ "$KSFT_MACHINE_SLOW" = "yes" ] && COUNT=25 +dd conv=sparse status=none if=/dev/zero bs=1M count=$COUNT of="$TMPINPUT" if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: No virtual ethernet pair device support in kernel" -- cgit v1.2.3 From 375e9bde9fc0fdb9e8d2339b06be5c2c2a5cb701 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 27 Aug 2024 17:19:30 +0900 Subject: selftests/hid: extract the utility part of hid_bpf.c into its own header When adding new test programs, we need the same mechanics to create new virtual devices and read from their matching hidraw node. Extract the common part into its own header so we can easily add new test C files.
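As a sketch of what the extraction enables, a new test C file now only has to glue the shared helpers together. The test below is hypothetical and abbreviated (the helper names are the ones moved into hid_common.h, and the kselftest harness provides _metadata inside TEST()):

/* Hypothetical new test reusing hid_common.h helpers; not one of the
 * tests added in this series. */
#include "hid_common.h"

TEST(uhid_roundtrip_example)
{
        int dev_id = 42;        /* arbitrary phys id used for matching */
        int uhid_fd, hidraw_fd;
        pthread_t tid;

        uhid_fd = setup_uhid(_metadata, dev_id);
        ASSERT_OK(uhid_start_listener(_metadata, &tid, uhid_fd));

        hidraw_fd = open_hidraw(dev_id);
        ASSERT_GE(hidraw_fd, 0);

        close(hidraw_fd);
        uhid_destroy(_metadata, uhid_fd);
}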
Link: https://patch.msgid.link/20240827-hidraw-revoke-v5-2-d004a7451aea@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hid_bpf.c | 437 +------------------------------ tools/testing/selftests/hid/hid_common.h | 436 ++++++++++++++++++++++++++++++ 2 files changed, 438 insertions(+), 435 deletions(-) create mode 100644 tools/testing/selftests/hid/hid_common.h (limited to 'tools') diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index 75b7b4ef6cfa..d10cf6883683 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -1,93 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2022 Red Hat */ +/* Copyright (c) 2022-2024 Red Hat */ #include "hid.skel.h" - -#include "../kselftest_harness.h" - +#include "hid_common.h" #include -#include -#include -#include -#include -#include -#include -#include -#include - -#define SHOW_UHID_DEBUG 0 - -#define min(a, b) \ - ({ __typeof__(a) _a = (a); \ - __typeof__(b) _b = (b); \ - _a < _b ? _a : _b; }) - -static unsigned char rdesc[] = { - 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ - 0x09, 0x21, /* Usage (Vendor Usage 0x21) */ - 0xa1, 0x01, /* COLLECTION (Application) */ - 0x09, 0x01, /* Usage (Vendor Usage 0x01) */ - 0xa1, 0x00, /* COLLECTION (Physical) */ - 0x85, 0x02, /* REPORT_ID (2) */ - 0x19, 0x01, /* USAGE_MINIMUM (1) */ - 0x29, 0x08, /* USAGE_MAXIMUM (3) */ - 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ - 0x25, 0xff, /* LOGICAL_MAXIMUM (255) */ - 0x95, 0x08, /* REPORT_COUNT (8) */ - 0x75, 0x08, /* REPORT_SIZE (8) */ - 0x81, 0x02, /* INPUT (Data,Var,Abs) */ - 0xc0, /* END_COLLECTION */ - 0x09, 0x01, /* Usage (Vendor Usage 0x01) */ - 0xa1, 0x00, /* COLLECTION (Physical) */ - 0x85, 0x01, /* REPORT_ID (1) */ - 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ - 0x19, 0x01, /* USAGE_MINIMUM (1) */ - 0x29, 0x03, /* USAGE_MAXIMUM (3) */ - 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ - 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ - 0x95, 0x03, /* REPORT_COUNT (3) */ - 0x75, 0x01, /* REPORT_SIZE (1) */ - 0x81, 0x02, /* INPUT (Data,Var,Abs) */ - 0x95, 0x01, /* REPORT_COUNT (1) */ - 0x75, 0x05, /* REPORT_SIZE (5) */ - 0x81, 0x01, /* INPUT (Cnst,Var,Abs) */ - 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ - 0x09, 0x30, /* USAGE (X) */ - 0x09, 0x31, /* USAGE (Y) */ - 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ - 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ - 0x75, 0x10, /* REPORT_SIZE (16) */ - 0x95, 0x02, /* REPORT_COUNT (2) */ - 0x81, 0x06, /* INPUT (Data,Var,Rel) */ - - 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ - 0x19, 0x01, /* USAGE_MINIMUM (1) */ - 0x29, 0x03, /* USAGE_MAXIMUM (3) */ - 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ - 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ - 0x95, 0x03, /* REPORT_COUNT (3) */ - 0x75, 0x01, /* REPORT_SIZE (1) */ - 0x91, 0x02, /* Output (Data,Var,Abs) */ - 0x95, 0x01, /* REPORT_COUNT (1) */ - 0x75, 0x05, /* REPORT_SIZE (5) */ - 0x91, 0x01, /* Output (Cnst,Var,Abs) */ - - 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ - 0x19, 0x06, /* USAGE_MINIMUM (6) */ - 0x29, 0x08, /* USAGE_MAXIMUM (8) */ - 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ - 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ - 0x95, 0x03, /* REPORT_COUNT (3) */ - 0x75, 0x01, /* REPORT_SIZE (1) */ - 0xb1, 0x02, /* Feature (Data,Var,Abs) */ - 0x95, 0x01, /* REPORT_COUNT (1) */ - 0x75, 0x05, /* REPORT_SIZE (5) */ - 0x91, 0x01, /* Output (Cnst,Var,Abs) */ - - 0xc0, /* END_COLLECTION */ - 0xc0, /* END_COLLECTION */ -}; - -static __u8 feature_data[] = { 1, 2 
}; struct attach_prog_args { int prog_fd; @@ -105,354 +20,6 @@ struct hid_hw_request_syscall_args { __u8 request_type; }; -#define ASSERT_OK(data) ASSERT_FALSE(data) -#define ASSERT_OK_PTR(ptr) ASSERT_NE(NULL, ptr) - -#define UHID_LOG(fmt, ...) do { \ - if (SHOW_UHID_DEBUG) \ - TH_LOG(fmt, ##__VA_ARGS__); \ -} while (0) - -static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER; - -static pthread_mutex_t uhid_output_mtx = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t uhid_output_cond = PTHREAD_COND_INITIALIZER; -static unsigned char output_report[10]; - -/* no need to protect uhid_stopped, only one thread accesses it */ -static bool uhid_stopped; - -static int uhid_write(struct __test_metadata *_metadata, int fd, const struct uhid_event *ev) -{ - ssize_t ret; - - ret = write(fd, ev, sizeof(*ev)); - if (ret < 0) { - TH_LOG("Cannot write to uhid: %m"); - return -errno; - } else if (ret != sizeof(*ev)) { - TH_LOG("Wrong size written to uhid: %zd != %zu", - ret, sizeof(ev)); - return -EFAULT; - } else { - return 0; - } -} - -static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb) -{ - struct uhid_event ev; - char buf[25]; - - sprintf(buf, "test-uhid-device-%d", rand_nb); - - memset(&ev, 0, sizeof(ev)); - ev.type = UHID_CREATE; - strcpy((char *)ev.u.create.name, buf); - ev.u.create.rd_data = rdesc; - ev.u.create.rd_size = sizeof(rdesc); - ev.u.create.bus = BUS_USB; - ev.u.create.vendor = 0x0001; - ev.u.create.product = 0x0a37; - ev.u.create.version = 0; - ev.u.create.country = 0; - - sprintf(buf, "%d", rand_nb); - strcpy((char *)ev.u.create.phys, buf); - - return uhid_write(_metadata, fd, &ev); -} - -static void uhid_destroy(struct __test_metadata *_metadata, int fd) -{ - struct uhid_event ev; - - memset(&ev, 0, sizeof(ev)); - ev.type = UHID_DESTROY; - - uhid_write(_metadata, fd, &ev); -} - -static int uhid_event(struct __test_metadata *_metadata, int fd) -{ - struct uhid_event ev, answer; - ssize_t ret; - - memset(&ev, 0, sizeof(ev)); - ret = read(fd, &ev, sizeof(ev)); - if (ret == 0) { - UHID_LOG("Read HUP on uhid-cdev"); - return -EFAULT; - } else if (ret < 0) { - UHID_LOG("Cannot read uhid-cdev: %m"); - return -errno; - } else if (ret != sizeof(ev)) { - UHID_LOG("Invalid size read from uhid-dev: %zd != %zu", - ret, sizeof(ev)); - return -EFAULT; - } - - switch (ev.type) { - case UHID_START: - pthread_mutex_lock(&uhid_started_mtx); - pthread_cond_signal(&uhid_started); - pthread_mutex_unlock(&uhid_started_mtx); - - UHID_LOG("UHID_START from uhid-dev"); - break; - case UHID_STOP: - uhid_stopped = true; - - UHID_LOG("UHID_STOP from uhid-dev"); - break; - case UHID_OPEN: - UHID_LOG("UHID_OPEN from uhid-dev"); - break; - case UHID_CLOSE: - UHID_LOG("UHID_CLOSE from uhid-dev"); - break; - case UHID_OUTPUT: - UHID_LOG("UHID_OUTPUT from uhid-dev"); - - pthread_mutex_lock(&uhid_output_mtx); - memcpy(output_report, - ev.u.output.data, - min(ev.u.output.size, sizeof(output_report))); - pthread_cond_signal(&uhid_output_cond); - pthread_mutex_unlock(&uhid_output_mtx); - break; - case UHID_GET_REPORT: - UHID_LOG("UHID_GET_REPORT from uhid-dev"); - - answer.type = UHID_GET_REPORT_REPLY; - answer.u.get_report_reply.id = ev.u.get_report.id; - answer.u.get_report_reply.err = ev.u.get_report.rnum == 1 ? 
0 : -EIO; - answer.u.get_report_reply.size = sizeof(feature_data); - memcpy(answer.u.get_report_reply.data, feature_data, sizeof(feature_data)); - - uhid_write(_metadata, fd, &answer); - - break; - case UHID_SET_REPORT: - UHID_LOG("UHID_SET_REPORT from uhid-dev"); - break; - default: - TH_LOG("Invalid event from uhid-dev: %u", ev.type); - } - - return 0; -} - -struct uhid_thread_args { - int fd; - struct __test_metadata *_metadata; -}; -static void *uhid_read_events_thread(void *arg) -{ - struct uhid_thread_args *args = (struct uhid_thread_args *)arg; - struct __test_metadata *_metadata = args->_metadata; - struct pollfd pfds[1]; - int fd = args->fd; - int ret = 0; - - pfds[0].fd = fd; - pfds[0].events = POLLIN; - - uhid_stopped = false; - - while (!uhid_stopped) { - ret = poll(pfds, 1, 100); - if (ret < 0) { - TH_LOG("Cannot poll for fds: %m"); - break; - } - if (pfds[0].revents & POLLIN) { - ret = uhid_event(_metadata, fd); - if (ret) - break; - } - } - - return (void *)(long)ret; -} - -static int uhid_start_listener(struct __test_metadata *_metadata, pthread_t *tid, int uhid_fd) -{ - struct uhid_thread_args args = { - .fd = uhid_fd, - ._metadata = _metadata, - }; - int err; - - pthread_mutex_lock(&uhid_started_mtx); - err = pthread_create(tid, NULL, uhid_read_events_thread, (void *)&args); - ASSERT_EQ(0, err) { - TH_LOG("Could not start the uhid thread: %d", err); - pthread_mutex_unlock(&uhid_started_mtx); - close(uhid_fd); - return -EIO; - } - pthread_cond_wait(&uhid_started, &uhid_started_mtx); - pthread_mutex_unlock(&uhid_started_mtx); - - return 0; -} - -static int uhid_send_event(struct __test_metadata *_metadata, int fd, __u8 *buf, size_t size) -{ - struct uhid_event ev; - - if (size > sizeof(ev.u.input.data)) - return -E2BIG; - - memset(&ev, 0, sizeof(ev)); - ev.type = UHID_INPUT2; - ev.u.input2.size = size; - - memcpy(ev.u.input2.data, buf, size); - - return uhid_write(_metadata, fd, &ev); -} - -static int setup_uhid(struct __test_metadata *_metadata, int rand_nb) -{ - int fd; - const char *path = "/dev/uhid"; - int ret; - - fd = open(path, O_RDWR | O_CLOEXEC); - ASSERT_GE(fd, 0) TH_LOG("open uhid-cdev failed; %d", fd); - - ret = uhid_create(_metadata, fd, rand_nb); - ASSERT_EQ(0, ret) { - TH_LOG("create uhid device failed: %d", ret); - close(fd); - } - - return fd; -} - -static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *dir) -{ - const char *target = "0003:0001:0A37.*"; - char phys[512]; - char uevent[1024]; - char temp[512]; - int fd, nread; - bool found = false; - - if (fnmatch(target, dir->d_name, 0)) - return false; - - /* we found the correct VID/PID, now check for phys */ - sprintf(uevent, "%s/%s/uevent", workdir, dir->d_name); - - fd = open(uevent, O_RDONLY | O_NONBLOCK); - if (fd < 0) - return false; - - sprintf(phys, "PHYS=%d", dev_id); - - nread = read(fd, temp, ARRAY_SIZE(temp)); - if (nread > 0 && (strstr(temp, phys)) != NULL) - found = true; - - close(fd); - - return found; -} - -static int get_hid_id(int dev_id) -{ - const char *workdir = "/sys/devices/virtual/misc/uhid"; - const char *str_id; - DIR *d; - struct dirent *dir; - int found = -1, attempts = 3; - - /* it would be nice to be able to use nftw, but the no_alu32 target doesn't support it */ - - while (found < 0 && attempts > 0) { - attempts--; - d = opendir(workdir); - if (d) { - while ((dir = readdir(d)) != NULL) { - if (!match_sysfs_device(dev_id, workdir, dir)) - continue; - - str_id = dir->d_name + sizeof("0003:0001:0A37."); - found = (int)strtol(str_id, NULL, 16); - - 
break; - } - closedir(d); - } - if (found < 0) - usleep(100000); - } - - return found; -} - -static int get_hidraw(int dev_id) -{ - const char *workdir = "/sys/devices/virtual/misc/uhid"; - char sysfs[1024]; - DIR *d, *subd; - struct dirent *dir, *subdir; - int i, found = -1; - - /* retry 5 times in case the system is loaded */ - for (i = 5; i > 0; i--) { - usleep(10); - d = opendir(workdir); - - if (!d) - continue; - - while ((dir = readdir(d)) != NULL) { - if (!match_sysfs_device(dev_id, workdir, dir)) - continue; - - sprintf(sysfs, "%s/%s/hidraw", workdir, dir->d_name); - - subd = opendir(sysfs); - if (!subd) - continue; - - while ((subdir = readdir(subd)) != NULL) { - if (fnmatch("hidraw*", subdir->d_name, 0)) - continue; - - found = atoi(subdir->d_name + strlen("hidraw")); - } - - closedir(subd); - - if (found > 0) - break; - } - closedir(d); - } - - return found; -} - -static int open_hidraw(int dev_id) -{ - int hidraw_number; - char hidraw_path[64] = { 0 }; - - hidraw_number = get_hidraw(dev_id); - if (hidraw_number < 0) - return hidraw_number; - - /* open hidraw node to check the other side of the pipe */ - sprintf(hidraw_path, "/dev/hidraw%d", hidraw_number); - return open(hidraw_path, O_RDWR | O_NONBLOCK); -} - FIXTURE(hid_bpf) { int dev_id; int uhid_fd; diff --git a/tools/testing/selftests/hid/hid_common.h b/tools/testing/selftests/hid/hid_common.h new file mode 100644 index 000000000000..f151f151a1ed --- /dev/null +++ b/tools/testing/selftests/hid/hid_common.h @@ -0,0 +1,436 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022-2024 Red Hat */ + +#include "../kselftest_harness.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SHOW_UHID_DEBUG 0 + +#define min(a, b) \ + ({ __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? 
_a : _b; }) + +static unsigned char rdesc[] = { + 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ + 0x09, 0x21, /* Usage (Vendor Usage 0x21) */ + 0xa1, 0x01, /* COLLECTION (Application) */ + 0x09, 0x01, /* Usage (Vendor Usage 0x01) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x85, 0x02, /* REPORT_ID (2) */ + 0x19, 0x01, /* USAGE_MINIMUM (1) */ + 0x29, 0x08, /* USAGE_MAXIMUM (3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0xff, /* LOGICAL_MAXIMUM (255) */ + 0x95, 0x08, /* REPORT_COUNT (8) */ + 0x75, 0x08, /* REPORT_SIZE (8) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0xc0, /* END_COLLECTION */ + 0x09, 0x01, /* Usage (Vendor Usage 0x01) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x85, 0x01, /* REPORT_ID (1) */ + 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ + 0x19, 0x01, /* USAGE_MINIMUM (1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x81, 0x01, /* INPUT (Cnst,Var,Abs) */ + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x30, /* USAGE (X) */ + 0x09, 0x31, /* USAGE (Y) */ + 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ + 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ + 0x75, 0x10, /* REPORT_SIZE (16) */ + 0x95, 0x02, /* REPORT_COUNT (2) */ + 0x81, 0x06, /* INPUT (Data,Var,Rel) */ + + 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ + 0x19, 0x01, /* USAGE_MINIMUM (1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x91, 0x02, /* Output (Data,Var,Abs) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x91, 0x01, /* Output (Cnst,Var,Abs) */ + + 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ + 0x19, 0x06, /* USAGE_MINIMUM (6) */ + 0x29, 0x08, /* USAGE_MAXIMUM (8) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0xb1, 0x02, /* Feature (Data,Var,Abs) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x91, 0x01, /* Output (Cnst,Var,Abs) */ + + 0xc0, /* END_COLLECTION */ + 0xc0, /* END_COLLECTION */ +}; + +static __u8 feature_data[] = { 1, 2 }; + +#define ASSERT_OK(data) ASSERT_FALSE(data) +#define ASSERT_OK_PTR(ptr) ASSERT_NE(NULL, ptr) + +#define UHID_LOG(fmt, ...) 
do { \ + if (SHOW_UHID_DEBUG) \ + TH_LOG(fmt, ##__VA_ARGS__); \ +} while (0) + +static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER; + +static pthread_mutex_t uhid_output_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t uhid_output_cond = PTHREAD_COND_INITIALIZER; +static unsigned char output_report[10]; + +/* no need to protect uhid_stopped, only one thread accesses it */ +static bool uhid_stopped; + +static int uhid_write(struct __test_metadata *_metadata, int fd, const struct uhid_event *ev) +{ + ssize_t ret; + + ret = write(fd, ev, sizeof(*ev)); + if (ret < 0) { + TH_LOG("Cannot write to uhid: %m"); + return -errno; + } else if (ret != sizeof(*ev)) { + TH_LOG("Wrong size written to uhid: %zd != %zu", + ret, sizeof(ev)); + return -EFAULT; + } else { + return 0; + } +} + +static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb) +{ + struct uhid_event ev; + char buf[25]; + + sprintf(buf, "test-uhid-device-%d", rand_nb); + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_CREATE; + strcpy((char *)ev.u.create.name, buf); + ev.u.create.rd_data = rdesc; + ev.u.create.rd_size = sizeof(rdesc); + ev.u.create.bus = BUS_USB; + ev.u.create.vendor = 0x0001; + ev.u.create.product = 0x0a37; + ev.u.create.version = 0; + ev.u.create.country = 0; + + sprintf(buf, "%d", rand_nb); + strcpy((char *)ev.u.create.phys, buf); + + return uhid_write(_metadata, fd, &ev); +} + +static void uhid_destroy(struct __test_metadata *_metadata, int fd) +{ + struct uhid_event ev; + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_DESTROY; + + uhid_write(_metadata, fd, &ev); +} + +static int uhid_event(struct __test_metadata *_metadata, int fd) +{ + struct uhid_event ev, answer; + ssize_t ret; + + memset(&ev, 0, sizeof(ev)); + ret = read(fd, &ev, sizeof(ev)); + if (ret == 0) { + UHID_LOG("Read HUP on uhid-cdev"); + return -EFAULT; + } else if (ret < 0) { + UHID_LOG("Cannot read uhid-cdev: %m"); + return -errno; + } else if (ret != sizeof(ev)) { + UHID_LOG("Invalid size read from uhid-dev: %zd != %zu", + ret, sizeof(ev)); + return -EFAULT; + } + + switch (ev.type) { + case UHID_START: + pthread_mutex_lock(&uhid_started_mtx); + pthread_cond_signal(&uhid_started); + pthread_mutex_unlock(&uhid_started_mtx); + + UHID_LOG("UHID_START from uhid-dev"); + break; + case UHID_STOP: + uhid_stopped = true; + + UHID_LOG("UHID_STOP from uhid-dev"); + break; + case UHID_OPEN: + UHID_LOG("UHID_OPEN from uhid-dev"); + break; + case UHID_CLOSE: + UHID_LOG("UHID_CLOSE from uhid-dev"); + break; + case UHID_OUTPUT: + UHID_LOG("UHID_OUTPUT from uhid-dev"); + + pthread_mutex_lock(&uhid_output_mtx); + memcpy(output_report, + ev.u.output.data, + min(ev.u.output.size, sizeof(output_report))); + pthread_cond_signal(&uhid_output_cond); + pthread_mutex_unlock(&uhid_output_mtx); + break; + case UHID_GET_REPORT: + UHID_LOG("UHID_GET_REPORT from uhid-dev"); + + answer.type = UHID_GET_REPORT_REPLY; + answer.u.get_report_reply.id = ev.u.get_report.id; + answer.u.get_report_reply.err = ev.u.get_report.rnum == 1 ? 
0 : -EIO; + answer.u.get_report_reply.size = sizeof(feature_data); + memcpy(answer.u.get_report_reply.data, feature_data, sizeof(feature_data)); + + uhid_write(_metadata, fd, &answer); + + break; + case UHID_SET_REPORT: + UHID_LOG("UHID_SET_REPORT from uhid-dev"); + break; + default: + TH_LOG("Invalid event from uhid-dev: %u", ev.type); + } + + return 0; +} + +struct uhid_thread_args { + int fd; + struct __test_metadata *_metadata; +}; +static void *uhid_read_events_thread(void *arg) +{ + struct uhid_thread_args *args = (struct uhid_thread_args *)arg; + struct __test_metadata *_metadata = args->_metadata; + struct pollfd pfds[1]; + int fd = args->fd; + int ret = 0; + + pfds[0].fd = fd; + pfds[0].events = POLLIN; + + uhid_stopped = false; + + while (!uhid_stopped) { + ret = poll(pfds, 1, 100); + if (ret < 0) { + TH_LOG("Cannot poll for fds: %m"); + break; + } + if (pfds[0].revents & POLLIN) { + ret = uhid_event(_metadata, fd); + if (ret) + break; + } + } + + return (void *)(long)ret; +} + +static int uhid_start_listener(struct __test_metadata *_metadata, pthread_t *tid, int uhid_fd) +{ + struct uhid_thread_args args = { + .fd = uhid_fd, + ._metadata = _metadata, + }; + int err; + + pthread_mutex_lock(&uhid_started_mtx); + err = pthread_create(tid, NULL, uhid_read_events_thread, (void *)&args); + ASSERT_EQ(0, err) { + TH_LOG("Could not start the uhid thread: %d", err); + pthread_mutex_unlock(&uhid_started_mtx); + close(uhid_fd); + return -EIO; + } + pthread_cond_wait(&uhid_started, &uhid_started_mtx); + pthread_mutex_unlock(&uhid_started_mtx); + + return 0; +} + +static int uhid_send_event(struct __test_metadata *_metadata, int fd, __u8 *buf, size_t size) +{ + struct uhid_event ev; + + if (size > sizeof(ev.u.input.data)) + return -E2BIG; + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_INPUT2; + ev.u.input2.size = size; + + memcpy(ev.u.input2.data, buf, size); + + return uhid_write(_metadata, fd, &ev); +} + +static int setup_uhid(struct __test_metadata *_metadata, int rand_nb) +{ + int fd; + const char *path = "/dev/uhid"; + int ret; + + fd = open(path, O_RDWR | O_CLOEXEC); + ASSERT_GE(fd, 0) TH_LOG("open uhid-cdev failed: %d", fd); + + ret = uhid_create(_metadata, fd, rand_nb); + ASSERT_EQ(0, ret) { + TH_LOG("create uhid device failed: %d", ret); + close(fd); + } + + return fd; +} + +static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *dir) +{ + const char *target = "0003:0001:0A37.*"; + char phys[512]; + char uevent[1024]; + char temp[512]; + int fd, nread; + bool found = false; + + if (fnmatch(target, dir->d_name, 0)) + return false; + + /* we found the correct VID/PID, now check for phys */ + sprintf(uevent, "%s/%s/uevent", workdir, dir->d_name); + + fd = open(uevent, O_RDONLY | O_NONBLOCK); + if (fd < 0) + return false; + + sprintf(phys, "PHYS=%d", dev_id); + + nread = read(fd, temp, ARRAY_SIZE(temp)); + if (nread > 0 && (strstr(temp, phys)) != NULL) + found = true; + + close(fd); + + return found; +} + +static int get_hid_id(int dev_id) +{ + const char *workdir = "/sys/devices/virtual/misc/uhid"; + const char *str_id; + DIR *d; + struct dirent *dir; + int found = -1, attempts = 3; + + /* it would be nice to be able to use nftw, but the no_alu32 target doesn't support it */ + + while (found < 0 && attempts > 0) { + attempts--; + d = opendir(workdir); + if (d) { + while ((dir = readdir(d)) != NULL) { + if (!match_sysfs_device(dev_id, workdir, dir)) + continue; + + str_id = dir->d_name + sizeof("0003:0001:0A37."); + found = (int)strtol(str_id, NULL, 16); + + 
break; + } + closedir(d); + } + if (found < 0) + usleep(100000); + } + + return found; +} + +static int get_hidraw(int dev_id) +{ + const char *workdir = "/sys/devices/virtual/misc/uhid"; + char sysfs[1024]; + DIR *d, *subd; + struct dirent *dir, *subdir; + int i, found = -1; + + /* retry 5 times in case the system is loaded */ + for (i = 5; i > 0; i--) { + usleep(10); + d = opendir(workdir); + + if (!d) + continue; + + while ((dir = readdir(d)) != NULL) { + if (!match_sysfs_device(dev_id, workdir, dir)) + continue; + + sprintf(sysfs, "%s/%s/hidraw", workdir, dir->d_name); + + subd = opendir(sysfs); + if (!subd) + continue; + + while ((subdir = readdir(subd)) != NULL) { + if (fnmatch("hidraw*", subdir->d_name, 0)) + continue; + + found = atoi(subdir->d_name + strlen("hidraw")); + } + + closedir(subd); + + if (found > 0) + break; + } + closedir(d); + } + + return found; +} + +static int open_hidraw(int dev_id) +{ + int hidraw_number; + char hidraw_path[64] = { 0 }; + + hidraw_number = get_hidraw(dev_id); + if (hidraw_number < 0) + return hidraw_number; + + /* open hidraw node to check the other side of the pipe */ + sprintf(hidraw_path, "/dev/hidraw%d", hidraw_number); + return open(hidraw_path, O_RDWR | O_NONBLOCK); +} -- cgit v1.2.3 From 8163892a629ca544af575ce54955bf275a3250cd Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 27 Aug 2024 17:19:31 +0900 Subject: selftests/hid: Add initial hidraw tests skeleton Largely inspired by hid_bpf.c for the fixture setup. Create a couple of tests for hidraw: - create a uhid device and check if the fixture is working properly - inject one uhid event and read it through hidraw These tests are not that useful for now, but will be once we start adding the ioctl and BPFs to revoke the hidraw node. Reviewed-by: Peter Hutterer Link: https://patch.msgid.link/20240827-hidraw-revoke-v5-3-d004a7451aea@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/.gitignore | 1 + tools/testing/selftests/hid/Makefile | 2 +- tools/testing/selftests/hid/hidraw.c | 90 ++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/hid/hidraw.c (limited to 'tools') diff --git a/tools/testing/selftests/hid/.gitignore b/tools/testing/selftests/hid/.gitignore index 995af0670f69..746c62361f77 100644 --- a/tools/testing/selftests/hid/.gitignore +++ b/tools/testing/selftests/hid/.gitignore @@ -2,4 +2,5 @@ bpftool *.skel.h /tools hid_bpf +hidraw results diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile index 2b5ea18bde38..72be55ac4bdf 100644 --- a/tools/testing/selftests/hid/Makefile +++ b/tools/testing/selftests/hid/Makefile @@ -32,7 +32,7 @@ CFLAGS += -Wno-unused-command-line-argument endif # Order correspond to 'make run_tests' order -TEST_GEN_PROGS = hid_bpf +TEST_GEN_PROGS = hid_bpf hidraw # Emit succinct information message describing current building step # $1 - generic step name (e.g., CC, LINK, etc); diff --git a/tools/testing/selftests/hid/hidraw.c b/tools/testing/selftests/hid/hidraw.c new file mode 100644 index 000000000000..37372709130c --- /dev/null +++ b/tools/testing/selftests/hid/hidraw.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022-2024 Red Hat */ + +#include "hid_common.h" + +FIXTURE(hidraw) { + int dev_id; + int uhid_fd; + int hidraw_fd; + int hid_id; + pthread_t tid; +}; +static void close_hidraw(FIXTURE_DATA(hidraw) * self) +{ + if (self->hidraw_fd) + close(self->hidraw_fd); + 
self->hidraw_fd = 0; +} + +FIXTURE_TEARDOWN(hidraw) { + void *uhid_err; + + uhid_destroy(_metadata, self->uhid_fd); + + close_hidraw(self); + pthread_join(self->tid, &uhid_err); +} +#define TEARDOWN_LOG(fmt, ...) do { \ + TH_LOG(fmt, ##__VA_ARGS__); \ + hidraw_teardown(_metadata, self, variant); \ +} while (0) + +FIXTURE_SETUP(hidraw) +{ + time_t t; + int err; + + /* initialize random number generator */ + srand((unsigned int)time(&t)); + + self->dev_id = rand() % 1024; + + self->uhid_fd = setup_uhid(_metadata, self->dev_id); + + /* locate the uevent file of the created device */ + self->hid_id = get_hid_id(self->dev_id); + ASSERT_GT(self->hid_id, 0) + TEARDOWN_LOG("Could not locate uhid device id: %d", self->hid_id); + + err = uhid_start_listener(_metadata, &self->tid, self->uhid_fd); + ASSERT_EQ(0, err) TEARDOWN_LOG("could not start uhid listener: %d", err); + + self->hidraw_fd = open_hidraw(self->dev_id); + ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw"); +} + +/* + * A simple test to see if the fixture is working fine. + * If this fails, none of the other tests will pass. + */ +TEST_F(hidraw, test_create_uhid) +{ +} + +/* + * Inject one event in the uhid device, + * check that we get the same data through hidraw + */ +TEST_F(hidraw, raw_event) +{ + __u8 buf[10] = {0}; + int err; + + /* inject one event */ + buf[0] = 1; + buf[1] = 42; + uhid_send_event(_metadata, self->uhid_fd, buf, 6); + + /* read the data from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, 6) TH_LOG("read_hidraw"); + ASSERT_EQ(buf[0], 1); + ASSERT_EQ(buf[1], 42); +} + +int main(int argc, char **argv) +{ + return test_harness_run(argc, argv); +} -- cgit v1.2.3 From 321f7798cfb8d834ae0ed0d467c8bf46804243f9 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 27 Aug 2024 17:19:32 +0900 Subject: selftests/hid: Add HIDIOCREVOKE tests Add 4 tests for the new revoke ioctl, for read/write/ioctl and poll. Reviewed-by: Peter Hutterer Link: https://patch.msgid.link/20240827-hidraw-revoke-v5-4-d004a7451aea@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hidraw.c | 147 +++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/hid/hidraw.c b/tools/testing/selftests/hid/hidraw.c index 37372709130c..f8b4f7ff292c 100644 --- a/tools/testing/selftests/hid/hidraw.c +++ b/tools/testing/selftests/hid/hidraw.c @@ -3,6 +3,11 @@ #include "hid_common.h" +/* for older kernels */ +#ifndef HIDIOCREVOKE +#define HIDIOCREVOKE _IOW('H', 0x0D, int) /* Revoke device access */ +#endif /* HIDIOCREVOKE */ + FIXTURE(hidraw) { int dev_id; int uhid_fd; @@ -84,6 +89,148 @@ TEST_F(hidraw, raw_event) ASSERT_EQ(buf[1], 42); } +/* + * After initial opening/checks of hidraw, revoke the hidraw + * node and check that we can not read any more data. 
+ */ +TEST_F(hidraw, raw_event_revoked) +{ + __u8 buf[10] = {0}; + int err; + + /* inject one event */ + buf[0] = 1; + buf[1] = 42; + uhid_send_event(_metadata, self->uhid_fd, buf, 6); + + /* read the data from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, 6) TH_LOG("read_hidraw"); + ASSERT_EQ(buf[0], 1); + ASSERT_EQ(buf[1], 42); + + /* call the revoke ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCREVOKE, NULL); + ASSERT_OK(err) TH_LOG("couldn't revoke the hidraw fd"); + + /* inject one other event */ + buf[0] = 1; + buf[1] = 43; + uhid_send_event(_metadata, self->uhid_fd, buf, 6); + + /* read the data from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, -1) TH_LOG("read_hidraw"); + ASSERT_EQ(errno, ENODEV) TH_LOG("unexpected error code while reading the hidraw node: %d", + errno); +} + +/* + * Revoke the hidraw node and check that we can not do any ioctl. + */ +TEST_F(hidraw, ioctl_revoked) +{ + int err, desc_size = 0; + + /* call the revoke ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCREVOKE, NULL); + ASSERT_OK(err) TH_LOG("couldn't revoke the hidraw fd"); + + /* do an ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCGRDESCSIZE, &desc_size); + ASSERT_EQ(err, -1) TH_LOG("ioctl_hidraw"); + ASSERT_EQ(errno, ENODEV) TH_LOG("unexpected error code while doing an ioctl: %d", + errno); +} + +/* + * Setup polling of the fd, and check that revoke works properly. + */ +TEST_F(hidraw, poll_revoked) +{ + struct pollfd pfds[1]; + __u8 buf[10] = {0}; + int err, ready; + + /* setup polling */ + pfds[0].fd = self->hidraw_fd; + pfds[0].events = POLLIN; + + /* inject one event */ + buf[0] = 1; + buf[1] = 42; + uhid_send_event(_metadata, self->uhid_fd, buf, 6); + + while (true) { + ready = poll(pfds, 1, 5000); + ASSERT_EQ(ready, 1) TH_LOG("poll return value"); + + if (pfds[0].revents & POLLIN) { + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, 6) TH_LOG("read_hidraw"); + ASSERT_EQ(buf[0], 1); + ASSERT_EQ(buf[1], 42); + + /* call the revoke ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCREVOKE, NULL); + ASSERT_OK(err) TH_LOG("couldn't revoke the hidraw fd"); + } else { + break; + } + } + + ASSERT_TRUE(pfds[0].revents & POLLHUP); +} + +/* + * After initial opening/checks of hidraw, revoke the hidraw + * node and check that we can not write any more data. 
+ */ +TEST_F(hidraw, write_event_revoked) +{ + struct timespec time_to_wait; + __u8 buf[10] = {0}; + int err; + + /* inject one event from hidraw */ + buf[0] = 1; /* report ID */ + buf[1] = 2; + buf[2] = 42; + + pthread_mutex_lock(&uhid_output_mtx); + + memset(output_report, 0, sizeof(output_report)); + clock_gettime(CLOCK_REALTIME, &time_to_wait); + time_to_wait.tv_sec += 2; + + err = write(self->hidraw_fd, buf, 3); + ASSERT_EQ(err, 3) TH_LOG("unexpected error while writing to hidraw node: %d", err); + + err = pthread_cond_timedwait(&uhid_output_cond, &uhid_output_mtx, &time_to_wait); + ASSERT_OK(err) TH_LOG("error while waiting for the condition"); + + ASSERT_EQ(output_report[0], 1); + ASSERT_EQ(output_report[1], 2); + ASSERT_EQ(output_report[2], 42); + + /* call the revoke ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCREVOKE, NULL); + ASSERT_OK(err) TH_LOG("couldn't revoke the hidraw fd"); + + /* inject one other event */ + buf[0] = 1; + buf[1] = 43; + err = write(self->hidraw_fd, buf, 3); + ASSERT_LT(err, 0) TH_LOG("unexpected success while writing to hidraw node: %d", err); + ASSERT_EQ(errno, ENODEV) TH_LOG("unexpected error code while writing to hidraw node: %d", + errno); + + pthread_mutex_unlock(&uhid_output_mtx); +} + int main(int argc, char **argv) { return test_harness_run(argc, argv); -- cgit v1.2.3 From 0fd77ae4a3c94eca3f07e01083680ad0056df628 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 29 Aug 2024 11:46:40 -0300 Subject: Revert "tools build: Remove leftover libcap tests that prevents fast path feature detection from working" Ian pointed out that the libcap feature test is also used by bpftool, so we can't remove it just because perf stopped using it; revert the removal of the feature test. Since both perf and bpftool use the fast path feature detection (tools/build/feature/test-all.c), the best thing is probably to keep libcap-devel installed when building perf even though it is not used there. This reverts commit 47b3b6435e4bfb61ae8ffc63a11bd3c310f69acf. 
Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/Makefile | 6 +++++- tools/build/feature/test-libcap.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 tools/build/feature/test-libcap.c (limited to 'tools') diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index b873fc58804e..12796808f07a 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -25,6 +25,7 @@ FILES= \ test-libbfd-liberty-z.bin \ test-cplus-demangle.bin \ test-cxa-demangle.bin \ + test-libcap.bin \ test-libelf.bin \ test-libelf-getphdrnum.bin \ test-libelf-gelf_getnote.bin \ @@ -111,7 +112,7 @@ all: $(FILES) __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS) BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1 BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl - BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd + BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd -lcap __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS) BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1 @@ -139,6 +140,9 @@ $(OUTPUT)test-fortify-source.bin: $(OUTPUT)test-bionic.bin: $(BUILD) +$(OUTPUT)test-libcap.bin: + $(BUILD) -lcap + $(OUTPUT)test-libelf.bin: $(BUILD) -lelf diff --git a/tools/build/feature/test-libcap.c b/tools/build/feature/test-libcap.c new file mode 100644 index 000000000000..d2a2e152195f --- /dev/null +++ b/tools/build/feature/test-libcap.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <sys/capability.h> +#include <linux/capability.h> + +int main(void) +{ + cap_flag_value_t val; + cap_t caps = cap_get_proc(); + + if (!caps) + return 1; + + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val) != 0) + return 1; + + if (cap_free(caps) != 0) + return 1; + + return 0; +} -- cgit v1.2.3 From b6aa0de9a53a231eb068ce1e62b5e7ec9e30e627 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:43 +0100 Subject: perf cs-etm: Create decoders after both AUX and HW_ID search passes Both of these passes gather information about how to create the decoders. AUX records determine formatted/unformatted, and the HW_IDs determine the traceID/metadata mappings. Therefore it makes sense to cache the information and wait until both passes are over before creating the decoders, rather than creating them at the first HW_ID found. This will allow a simplification of the creation process where cs_etm_queue->traceid_list will be used exclusively to create the decoders, rather than the current two methods that depend on whether the trace is formatted or not. Previously the sample CPU from the AUX record was used to initialize the decoder CPU, but actually sample CPU == AUX queue index in per-CPU mode, so saving the sample CPU isn't required. Similarly formatted/unformatted was used upfront to create the decoders, but now it's cached until later.
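To make the ordering concrete, here is a minimal standalone C model of the two-pass shape (hypothetical names and a compile-and-run toy, not the perf implementation itself):

    #include <stdbool.h>
    #include <stdio.h>

    enum fmt { UNSET, FORMATTED, UNFORMATTED };

    struct queue {
        enum fmt format;  /* cached by the AUX-record pass */
        int nr_mappings;  /* cached by the HW_ID pass */
        bool has_decoder;
    };

    /* Pass 1: each AUX record says whether its queue is raw or formatted. */
    static void aux_pass(struct queue *q, bool raw)
    {
        q->format = raw ? UNFORMATTED : FORMATTED;
    }

    /* Pass 2: each HW_ID packet contributes one traceID/metadata mapping. */
    static void hw_id_pass(struct queue *q)
    {
        q->nr_mappings++;
    }

    /* Decoders are only built once both passes have filled in the queue. */
    static void create_decoder(struct queue *q)
    {
        if (q->format == UNSET)
            return;  /* empty queue: nothing was recorded for it */
        q->has_decoder = true;
    }

    int main(void)
    {
        struct queue q = { .format = UNSET };

        aux_pass(&q, true);  /* first pass over AUX records */
        hw_id_pass(&q);      /* second pass over HW_ID packets */
        create_decoder(&q);  /* creation deferred until both are done */
        printf("decoder: %d, mappings: %d\n", q.has_decoder, q.nr_mappings);
        return 0;
    }

The only point of the sketch is that decoder creation happens strictly after both caching steps, which is what the patch below rearranges.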
Reviewed-by: Anshuman Khandual Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: James Clark Tested-by: Ganapatrao Kulkarni Tested-by: Leo Yan Acked-by: Suzuki Poulouse Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-2-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 182 +++++++++++++++++++++++++++++------------------ 1 file changed, 113 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 34552940ec1d..0915a4356fb3 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -97,12 +97,19 @@ struct cs_etm_traceid_queue { struct cs_etm_packet_queue packet_queue; }; +enum cs_etm_format { + UNSET, + FORMATTED, + UNFORMATTED +}; + struct cs_etm_queue { struct cs_etm_auxtrace *etm; struct cs_etm_decoder *decoder; struct auxtrace_buffer *buffer; unsigned int queue_nr; u8 pending_timestamp_chan_id; + enum cs_etm_format format; u64 offset; const unsigned char *buf; size_t buf_len, buf_used; @@ -696,7 +703,7 @@ static void cs_etm__set_trace_param_ete(struct cs_etm_trace_params *t_params, static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, struct cs_etm_auxtrace *etm, - bool formatted, + enum cs_etm_format format, int sample_cpu, int decoders) { @@ -705,7 +712,7 @@ static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, u64 architecture; for (t_idx = 0; t_idx < decoders; t_idx++) { - if (formatted) + if (format == FORMATTED) m_idx = t_idx; else { m_idx = get_cpu_data_idx(etm, sample_cpu); @@ -738,8 +745,7 @@ static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, static int cs_etm__init_decoder_params(struct cs_etm_decoder_params *d_params, struct cs_etm_queue *etmq, - enum cs_etm_decoder_operation mode, - bool formatted) + enum cs_etm_decoder_operation mode) { int ret = -EINVAL; @@ -749,7 +755,7 @@ static int cs_etm__init_decoder_params(struct cs_etm_decoder_params *d_params, d_params->packet_printer = cs_etm__packet_dump; d_params->operation = mode; d_params->data = etmq; - d_params->formatted = formatted; + d_params->formatted = etmq->format == FORMATTED; d_params->fsyncs = false; d_params->hsyncs = false; d_params->frame_aligned = true; @@ -1041,81 +1047,34 @@ out: return ret; } -static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, - bool formatted, int sample_cpu) +static struct cs_etm_queue *cs_etm__alloc_queue(void) { - struct cs_etm_decoder_params d_params; - struct cs_etm_trace_params *t_params = NULL; - struct cs_etm_queue *etmq; - /* - * Each queue can only contain data from one CPU when unformatted, so only one decoder is - * needed. - */ - int decoders = formatted ? 
etm->num_cpu : 1; - - etmq = zalloc(sizeof(*etmq)); + struct cs_etm_queue *etmq = zalloc(sizeof(*etmq)); if (!etmq) return NULL; etmq->traceid_queues_list = intlist__new(NULL); if (!etmq->traceid_queues_list) - goto out_free; - - /* Use metadata to fill in trace parameters for trace decoder */ - t_params = zalloc(sizeof(*t_params) * decoders); - - if (!t_params) - goto out_free; - - if (cs_etm__init_trace_params(t_params, etm, formatted, sample_cpu, decoders)) - goto out_free; - - /* Set decoder parameters to decode trace packets */ - if (cs_etm__init_decoder_params(&d_params, etmq, - dump_trace ? CS_ETM_OPERATION_PRINT : - CS_ETM_OPERATION_DECODE, - formatted)) - goto out_free; - - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, - t_params); - - if (!etmq->decoder) - goto out_free; - - /* - * Register a function to handle all memory accesses required by - * the trace decoder library. - */ - if (cs_etm_decoder__add_mem_access_cb(etmq->decoder, - 0x0L, ((u64) -1L), - cs_etm__mem_access)) - goto out_free_decoder; + free(etmq); - zfree(&t_params); return etmq; - -out_free_decoder: - cs_etm_decoder__free(etmq->decoder); -out_free: - intlist__delete(etmq->traceid_queues_list); - free(etmq); - - return NULL; } static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, struct auxtrace_queue *queue, - unsigned int queue_nr, - bool formatted, - int sample_cpu) + unsigned int queue_nr, enum cs_etm_format format) { struct cs_etm_queue *etmq = queue->priv; + if (etmq && format != etmq->format) { + pr_err("CS_ETM: mixed formatted and unformatted trace not supported\n"); + return -EINVAL; + } + if (list_empty(&queue->head) || etmq) return 0; - etmq = cs_etm__alloc_queue(etm, formatted, sample_cpu); + etmq = cs_etm__alloc_queue(); if (!etmq) return -ENOMEM; @@ -1123,7 +1082,9 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, queue->priv = etmq; etmq->etm = etm; etmq->queue_nr = queue_nr; + queue->cpu = queue_nr; /* Placeholder, may be reset to -1 in per-thread mode */ etmq->offset = 0; + etmq->format = format; return 0; } @@ -2818,7 +2779,7 @@ static int cs_etm__process_auxtrace_event(struct perf_session *session, * formatted in piped mode (true). */ err = cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], - idx, true, -1); + idx, FORMATTED); if (err) return err; @@ -2939,7 +2900,7 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o union perf_event auxtrace_fragment; __u64 aux_offset, aux_size; __u32 idx; - bool formatted; + enum cs_etm_format format; struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, @@ -3022,9 +2983,10 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o return err; idx = auxtrace_event->idx; - formatted = !(aux_event->flags & PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW); - return cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], - idx, formatted, sample->cpu); + format = (aux_event->flags & PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW) ? + UNFORMATTED : FORMATTED; + + return cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], idx, format); } /* Wasn't inside this buffer, but there were no parse errors. 1 == 'not found' */ @@ -3208,6 +3170,84 @@ static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) return 0; } +/* + * Use the data gathered by the peeks for HW_ID (trace ID mappings) and AUX + * (formatted or not) packets to create the decoders. 
+ */ +static int cs_etm__create_queue_decoders(struct cs_etm_queue *etmq) +{ + struct cs_etm_decoder_params d_params; + + /* + * Each queue can only contain data from one CPU when unformatted, so only one decoder is + * needed. + */ + int decoders = etmq->format == FORMATTED ? etmq->etm->num_cpu : 1; + + /* Use metadata to fill in trace parameters for trace decoder */ + struct cs_etm_trace_params *t_params = zalloc(sizeof(*t_params) * decoders); + + if (!t_params) + goto out_free; + + if (cs_etm__init_trace_params(t_params, etmq->etm, etmq->format, + etmq->queue_nr, decoders)) + goto out_free; + + /* Set decoder parameters to decode trace packets */ + if (cs_etm__init_decoder_params(&d_params, etmq, + dump_trace ? CS_ETM_OPERATION_PRINT : + CS_ETM_OPERATION_DECODE)) + goto out_free; + + etmq->decoder = cs_etm_decoder__new(decoders, &d_params, + t_params); + + if (!etmq->decoder) + goto out_free; + + /* + * Register a function to handle all memory accesses required by + * the trace decoder library. + */ + if (cs_etm_decoder__add_mem_access_cb(etmq->decoder, + 0x0L, ((u64) -1L), + cs_etm__mem_access)) + goto out_free_decoder; + + zfree(&t_params); + return 0; + +out_free_decoder: + cs_etm_decoder__free(etmq->decoder); +out_free: + zfree(&t_params); + return -EINVAL; +} + +static int cs_etm__create_decoders(struct cs_etm_auxtrace *etm) +{ + struct auxtrace_queues *queues = &etm->queues; + + for (unsigned int i = 0; i < queues->nr_queues; i++) { + bool empty = list_empty(&queues->queue_array[i].head); + struct cs_etm_queue *etmq = queues->queue_array[i].priv; + int ret; + + /* + * Don't create decoders for empty queues, mainly because + * etmq->format is unknown for empty queues. + */ + if (empty) + continue; + + ret = cs_etm__create_queue_decoders(etmq); + if (ret) + return ret; + } + return 0; +} + int cs_etm__process_auxtrace_info_full(union perf_event *event, struct perf_session *session) { @@ -3371,6 +3411,10 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, if (err) goto err_free_queues; + err = cs_etm__queue_aux_records(session); + if (err) + goto err_free_queues; + /* * Map Trace ID values to CPU metadata. * @@ -3393,7 +3437,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, * flags if present. */ - /* first scan for AUX_OUTPUT_HW_ID records to map trace ID values to CPU metadata */ + /* Scan for AUX_OUTPUT_HW_ID records to map trace ID values to CPU metadata */ aux_hw_id_found = 0; err = perf_session__peek_events(session, session->header.data_offset, session->header.data_size, @@ -3411,7 +3455,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, if (err) goto err_free_queues; - err = cs_etm__queue_aux_records(session); + err = cs_etm__create_decoders(etm); if (err) goto err_free_queues; -- cgit v1.2.3 From 57880a7966be510cdaa08ab76b9d63e3df786bf0 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:44 +0100 Subject: perf: cs-etm: Allocate queues for all CPUs Make cs_etm__setup_queue() set up a queue even if it's empty, and pre-allocate queues based on the max CPU that was recorded. In per-CPU mode, aux queues are indexed by CPU ID even if not all CPUs are recorded; sparse queue arrays aren't used. This allows HW_IDs to be saved even if no aux data was received in that queue, without having to call cs_etm__setup_queue() from two different places. 
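As a sketch of the dense, CPU-indexed layout this relies on (hypothetical names, not the perf code):

    #include <stdio.h>
    #include <stdlib.h>

    struct queue { int cpu; void *priv; };

    /* One queue per possible CPU id 0..max_cpu, even for CPUs that were
     * never recorded; the queue index doubles as the CPU id. */
    static struct queue *alloc_queues(int max_cpu)
    {
        struct queue *qs = calloc(max_cpu + 1, sizeof(*qs));

        if (!qs)
            return NULL;
        for (int i = 0; i <= max_cpu; i++)
            qs[i].cpu = i;
        return qs;
    }

    int main(void)
    {
        /* say only CPUs 1 and 3 were recorded; the max CPU id seen is 3 */
        struct queue *qs = alloc_queues(3);

        if (!qs)
            return 1;
        /* a HW_ID for CPU 2 can be stored even though CPU 2 produced no
         * aux data, because qs[2] already exists */
        printf("queue for cpu %d is pre-allocated\n", qs[2].cpu);
        free(qs);
        return 0;
    }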
Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-3-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 53 +++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 0915a4356fb3..03df8894a900 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1062,16 +1062,11 @@ static struct cs_etm_queue *cs_etm__alloc_queue(void) static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, struct auxtrace_queue *queue, - unsigned int queue_nr, enum cs_etm_format format) + unsigned int queue_nr) { struct cs_etm_queue *etmq = queue->priv; - if (etmq && format != etmq->format) { - pr_err("CS_ETM: mixed formatted and unformatted trace not supported\n"); - return -EINVAL; - } - - if (list_empty(&queue->head) || etmq) + if (etmq) return 0; etmq = cs_etm__alloc_queue(); @@ -1084,7 +1079,6 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, etmq->queue_nr = queue_nr; queue->cpu = queue_nr; /* Placeholder, may be reset to -1 in per-thread mode */ etmq->offset = 0; - etmq->format = format; return 0; } @@ -2772,17 +2766,6 @@ static int cs_etm__process_auxtrace_event(struct perf_session *session, if (err) return err; - /* - * Knowing if the trace is formatted or not requires a lookup of - * the aux record so only works in non-piped mode where data is - * queued in cs_etm__queue_aux_records(). Always assume - * formatted in piped mode (true). - */ - err = cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], - idx, FORMATTED); - if (err) - return err; - if (dump_trace) if (auxtrace_buffer__get_data(buffer, fd)) { cs_etm__dump_event(etm->queues.queue_array[idx].priv, buffer); @@ -2899,7 +2882,6 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o struct perf_record_auxtrace *auxtrace_event; union perf_event auxtrace_fragment; __u64 aux_offset, aux_size; - __u32 idx; enum cs_etm_format format; struct cs_etm_auxtrace *etm = container_of(session->auxtrace, @@ -2966,6 +2948,8 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o if (aux_offset >= auxtrace_event->offset && aux_offset + aux_size <= auxtrace_event->offset + auxtrace_event->size) { + struct cs_etm_queue *etmq = etm->queues.queue_array[auxtrace_event->idx].priv; + /* * If this AUX event was inside this buffer somewhere, create a new auxtrace event * based on the sizes of the aux event, and queue that fragment. @@ -2982,11 +2966,14 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o if (err) return err; - idx = auxtrace_event->idx; format = (aux_event->flags & PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW) ? UNFORMATTED : FORMATTED; - - return cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], idx, format); + if (etmq->format != UNSET && format != etmq->format) { + pr_err("CS_ETM: mixed formatted and unformatted trace not supported\n"); + return -EINVAL; + } + etmq->format = format; + return 0; } /* Wasn't inside this buffer, but there were no parse errors. 
1 == 'not found' */ @@ -3238,6 +3225,7 @@ static int cs_etm__create_decoders(struct cs_etm_auxtrace *etm) * Don't create decoders for empty queues, mainly because * etmq->format is unknown for empty queues. */ + assert(empty == (etmq->format == UNSET)); if (empty) continue; @@ -3257,10 +3245,10 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, int event_header_size = sizeof(struct perf_event_header); int total_size = auxtrace_info->header.size; int priv_size = 0; - int num_cpu; + int num_cpu, max_cpu = 0; int err = 0; int aux_hw_id_found; - int i, j; + int i; u64 *ptr = NULL; u64 **metadata = NULL; @@ -3291,7 +3279,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, * required by the trace decoder to properly decode the trace due * to its highly compressed nature. */ - for (j = 0; j < num_cpu; j++) { + for (int j = 0; j < num_cpu; j++) { if (ptr[i] == __perf_cs_etmv3_magic) { metadata[j] = cs_etm__create_meta_blk(ptr, &i, @@ -3315,6 +3303,9 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, err = -ENOMEM; goto err_free_metadata; } + + if ((int) metadata[j][CS_ETM_CPU] > max_cpu) + max_cpu = metadata[j][CS_ETM_CPU]; } /* @@ -3344,10 +3335,16 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, */ etm->pid_fmt = cs_etm__init_pid_fmt(metadata[0]); - err = auxtrace_queues__init(&etm->queues); + err = auxtrace_queues__init_nr(&etm->queues, max_cpu + 1); if (err) goto err_free_etm; + for (unsigned int j = 0; j < etm->queues.nr_queues; ++j) { + err = cs_etm__setup_queue(etm, &etm->queues.queue_array[j], j); + if (err) + goto err_free_queues; + } + if (session->itrace_synth_opts->set) { etm->synth_opts = *session->itrace_synth_opts; } else { @@ -3469,7 +3466,7 @@ err_free_etm: zfree(&etm); err_free_metadata: /* No need to check @metadata[j], free(NULL) is supported */ - for (j = 0; j < num_cpu; j++) + for (int j = 0; j < num_cpu; j++) zfree(&metadata[j]); zfree(&metadata); err_free_traceid_list: -- cgit v1.2.3 From c634d6f4e12d00c954410ba11db45799a8c77b5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 27 Aug 2024 13:37:21 -0700 Subject: libbpf: Fix bpf_object__open_skeleton()'s mishandling of options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We make an ugly copy of options in bpf_object__open_skeleton() just to be able to set the object name from the skeleton's recorded name (while still allowing the user to override it through opts->object_name). This is not just ugly, it is also broken: the memcpy() doesn't take into account potential size differences between skel_opts and the user-provided opts caused by backward and forward compatibility. This leads to copying over extra bytes and then failing to validate the options properly. Technically, it could even lead to a SIGSEGV if we are unlucky. So just get rid of that memory copy completely and instead pass the default object name into bpf_object_open() directly, simplifying all this significantly. The rule now is that obj_name should be non-NULL for bpf_object_open() when called with an in-memory buffer, so validate that explicitly as well. We adapt bpf_object__open_mem() to this as well and generate the default name (based on the buffer's memory address and size) outside of bpf_object_open(). 
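The hazard is generic to size-versioned option structs like bpf_object_open_opts, which record their own size in a .sz field; a standalone illustration (toy types, not libbpf's actual definitions):

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* the opts struct as an older application was compiled against */
    struct opts_v1 { size_t sz; const char *name; };

    /* the same struct as the library sees it today: one field longer */
    struct opts_v2 { size_t sz; const char *name; int log_level; };

    static void lib_open(const void *user_opts)
    {
        const struct opts_v2 *opts = user_opts;
        struct opts_v2 local;

        /* memcpy(&local, opts, sizeof(local)) would be the broken
         * pattern: it copies sizeof(struct opts_v2) bytes even when the
         * caller only allocated a struct opts_v1, reading past the end
         * of the caller's object and leaving garbage in newer fields. */

        /* trust only the size the caller recorded in .sz, zero-filling
         * whatever fields the caller's version doesn't know about */
        memset(&local, 0, sizeof(local));
        memcpy(&local, opts, opts->sz < sizeof(local) ? opts->sz : sizeof(local));

        printf("name=%s log_level=%d\n", local.name, local.log_level);
    }

    int main(void)
    {
        struct opts_v1 old = { .sz = sizeof(old), .name = "demo" };

        lib_open(&old);  /* safe: only .sz bytes of 'old' are read */
        return 0;
    }

Dropping the copy entirely, as the patch below does, sidesteps the problem instead of re-implementing this size dance.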
Fixes: d66562fba1ce ("libbpf: Add BPF object skeleton support") Reported-by: Daniel Müller Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Daniel Müller Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240827203721.1145494-1-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 52 ++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e55353887439..d3a542649e6b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7905,16 +7905,19 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object } static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, + const char *obj_name, const struct bpf_object_open_opts *opts) { - const char *obj_name, *kconfig, *btf_tmp_path, *token_path; + const char *kconfig, *btf_tmp_path, *token_path; struct bpf_object *obj; - char tmp_name[64]; int err; char *log_buf; size_t log_size; __u32 log_level; + if (obj_buf && !obj_name) + return ERR_PTR(-EINVAL); + if (elf_version(EV_CURRENT) == EV_NONE) { pr_warn("failed to init libelf for %s\n", path ? : "(mem buf)"); @@ -7924,16 +7927,12 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, if (!OPTS_VALID(opts, bpf_object_open_opts)) return ERR_PTR(-EINVAL); - obj_name = OPTS_GET(opts, object_name, NULL); + obj_name = OPTS_GET(opts, object_name, NULL) ?: obj_name; if (obj_buf) { - if (!obj_name) { - snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", - (unsigned long)obj_buf, - (unsigned long)obj_buf_sz); - obj_name = tmp_name; - } path = obj_name; pr_debug("loading object '%s' from buffer\n", obj_name); + } else { + pr_debug("loading object from %s\n", path); } log_buf = OPTS_GET(opts, kernel_log_buf, NULL); @@ -8017,9 +8016,7 @@ bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) if (!path) return libbpf_err_ptr(-EINVAL); - pr_debug("loading %s\n", path); - - return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); + return libbpf_ptr(bpf_object_open(path, NULL, 0, NULL, opts)); } struct bpf_object *bpf_object__open(const char *path) @@ -8031,10 +8028,15 @@ struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { + char tmp_name[64]; + if (!obj_buf || obj_buf_sz == 0) return libbpf_err_ptr(-EINVAL); - return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); + /* create a (quite useless) default "name" for this memory buffer object */ + snprintf(tmp_name, sizeof(tmp_name), "%lx-%zx", (unsigned long)obj_buf, obj_buf_sz); + + return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, tmp_name, opts)); } static int bpf_object_unload(struct bpf_object *obj) @@ -13761,29 +13763,13 @@ static int populate_skeleton_progs(const struct bpf_object *obj, int bpf_object__open_skeleton(struct bpf_object_skeleton *s, const struct bpf_object_open_opts *opts) { - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts, - .object_name = s->name, - ); struct bpf_object *obj; int err; - /* Attempt to preserve opts->object_name, unless overriden by user - * explicitly. Overwriting object name for skeletons is discouraged, - * as it breaks global data maps, because they contain object name - * prefix as their own map name prefix. When skeleton is generated, - * bpftool is making an assumption that this name will stay the same. 
- */ - if (opts) { - memcpy(&skel_opts, opts, sizeof(*opts)); - if (!opts->object_name) - skel_opts.object_name = s->name; - } - - obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts); - err = libbpf_get_error(obj); - if (err) { - pr_warn("failed to initialize skeleton BPF object '%s': %d\n", - s->name, err); + obj = bpf_object_open(NULL, s->data, s->data_sz, s->name, opts); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + pr_warn("failed to initialize skeleton BPF object '%s': %d\n", s->name, err); return libbpf_err(err); } -- cgit v1.2.3 From 77c123f53e97ad4bde0271eb671b71774a99ebf6 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:45 +0100 Subject: perf: cs-etm: Move traceid_list to each queue The global list won't work for per-sink trace ID allocations, so put a list in each queue where the IDs will be unique to that queue. To keep the same behavior as before, for version 0 of the HW_ID packets, copy all the HW_ID mappings into all queues. This change doesn't affect the decoders, only trace ID lookups on the Perf side. The decoders are still created with global mappings which will be fixed in a later commit. Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-4-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 28 +-- tools/perf/util/cs-etm.c | 215 ++++++++++++++---------- tools/perf/util/cs-etm.h | 2 +- 3 files changed, 147 insertions(+), 98 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index e917985bbbe6..0c9c48cedbf1 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -388,7 +388,8 @@ cs_etm_decoder__reset_timestamp(struct cs_etm_packet_queue *packet_queue) } static ocsd_datapath_resp_t -cs_etm_decoder__buffer_packet(struct cs_etm_packet_queue *packet_queue, +cs_etm_decoder__buffer_packet(struct cs_etm_queue *etmq, + struct cs_etm_packet_queue *packet_queue, const u8 trace_chan_id, enum cs_etm_sample_type sample_type) { @@ -398,7 +399,7 @@ cs_etm_decoder__buffer_packet(struct cs_etm_packet_queue *packet_queue, if (packet_queue->packet_count >= CS_ETM_PACKET_MAX_BUFFER - 1) return OCSD_RESP_FATAL_SYS_ERR; - if (cs_etm__get_cpu(trace_chan_id, &cpu) < 0) + if (cs_etm__get_cpu(etmq, trace_chan_id, &cpu) < 0) return OCSD_RESP_FATAL_SYS_ERR; et = packet_queue->tail; @@ -436,7 +437,7 @@ cs_etm_decoder__buffer_range(struct cs_etm_queue *etmq, int ret = 0; struct cs_etm_packet *packet; - ret = cs_etm_decoder__buffer_packet(packet_queue, trace_chan_id, + ret = cs_etm_decoder__buffer_packet(etmq, packet_queue, trace_chan_id, CS_ETM_RANGE); if (ret != OCSD_RESP_CONT && ret != OCSD_RESP_WAIT) return ret; @@ -496,7 +497,8 @@ out: } static ocsd_datapath_resp_t -cs_etm_decoder__buffer_discontinuity(struct cs_etm_packet_queue *queue, +cs_etm_decoder__buffer_discontinuity(struct cs_etm_queue *etmq, + struct cs_etm_packet_queue *queue, const uint8_t trace_chan_id) { /* @@ -504,18 +506,19 @@ cs_etm_decoder__buffer_discontinuity(struct cs_etm_packet_queue *queue, * reset time 
statistics. */ cs_etm_decoder__reset_timestamp(queue); - return cs_etm_decoder__buffer_packet(queue, trace_chan_id, + return cs_etm_decoder__buffer_packet(etmq, queue, trace_chan_id, CS_ETM_DISCONTINUITY); } static ocsd_datapath_resp_t -cs_etm_decoder__buffer_exception(struct cs_etm_packet_queue *queue, +cs_etm_decoder__buffer_exception(struct cs_etm_queue *etmq, + struct cs_etm_packet_queue *queue, const ocsd_generic_trace_elem *elem, const uint8_t trace_chan_id) { int ret = 0; struct cs_etm_packet *packet; - ret = cs_etm_decoder__buffer_packet(queue, trace_chan_id, + ret = cs_etm_decoder__buffer_packet(etmq, queue, trace_chan_id, CS_ETM_EXCEPTION); if (ret != OCSD_RESP_CONT && ret != OCSD_RESP_WAIT) return ret; @@ -527,10 +530,11 @@ cs_etm_decoder__buffer_exception(struct cs_etm_packet_queue *queue, } static ocsd_datapath_resp_t -cs_etm_decoder__buffer_exception_ret(struct cs_etm_packet_queue *queue, +cs_etm_decoder__buffer_exception_ret(struct cs_etm_queue *etmq, + struct cs_etm_packet_queue *queue, const uint8_t trace_chan_id) { - return cs_etm_decoder__buffer_packet(queue, trace_chan_id, + return cs_etm_decoder__buffer_packet(etmq, queue, trace_chan_id, CS_ETM_EXCEPTION_RET); } @@ -599,7 +603,7 @@ static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer( case OCSD_GEN_TRC_ELEM_EO_TRACE: case OCSD_GEN_TRC_ELEM_NO_SYNC: case OCSD_GEN_TRC_ELEM_TRACE_ON: - resp = cs_etm_decoder__buffer_discontinuity(packet_queue, + resp = cs_etm_decoder__buffer_discontinuity(etmq, packet_queue, trace_chan_id); break; case OCSD_GEN_TRC_ELEM_INSTR_RANGE: @@ -607,11 +611,11 @@ static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer( trace_chan_id); break; case OCSD_GEN_TRC_ELEM_EXCEPTION: - resp = cs_etm_decoder__buffer_exception(packet_queue, elem, + resp = cs_etm_decoder__buffer_exception(etmq, packet_queue, elem, trace_chan_id); break; case OCSD_GEN_TRC_ELEM_EXCEPTION_RET: - resp = cs_etm_decoder__buffer_exception_ret(packet_queue, + resp = cs_etm_decoder__buffer_exception_ret(etmq, packet_queue, trace_chan_id); break; case OCSD_GEN_TRC_ELEM_TIMESTAMP: diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 03df8894a900..0f56618fc7a8 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -116,16 +116,18 @@ struct cs_etm_queue { /* Conversion between traceID and index in traceid_queues array */ struct intlist *traceid_queues_list; struct cs_etm_traceid_queue **traceid_queues; + /* Conversion between traceID and metadata pointers */ + struct intlist *traceid_list; }; -/* RB tree for quick conversion between traceID and metadata pointers */ -static struct intlist *traceid_list; - static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm); static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm, pid_t tid); static int cs_etm__get_data_block(struct cs_etm_queue *etmq); static int cs_etm__decode_data_block(struct cs_etm_queue *etmq); +static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata); +static u64 *get_cpu_data(struct cs_etm_auxtrace *etm, int cpu); +static int cs_etm__metadata_set_trace_id(u8 trace_chan_id, u64 *cpu_metadata); /* PTMs ETMIDR [11:8] set to b0011 */ #define ETMIDR_PTM_VERSION 0x00000300 @@ -151,12 +153,12 @@ static u32 cs_etm__get_v7_protocol_version(u32 etmidr) return CS_ETM_PROTO_ETMV3; } -static int cs_etm__get_magic(u8 trace_chan_id, u64 *magic) +static int cs_etm__get_magic(struct cs_etm_queue *etmq, u8 trace_chan_id, u64 *magic) { struct int_node *inode; u64 *metadata; - inode = 
intlist__find(traceid_list, trace_chan_id); + inode = intlist__find(etmq->traceid_list, trace_chan_id); if (!inode) return -EINVAL; @@ -165,12 +167,12 @@ static int cs_etm__get_magic(u8 trace_chan_id, u64 *magic) return 0; } -int cs_etm__get_cpu(u8 trace_chan_id, int *cpu) +int cs_etm__get_cpu(struct cs_etm_queue *etmq, u8 trace_chan_id, int *cpu) { struct int_node *inode; u64 *metadata; - inode = intlist__find(traceid_list, trace_chan_id); + inode = intlist__find(etmq->traceid_list, trace_chan_id); if (!inode) return -EINVAL; @@ -222,30 +224,108 @@ enum cs_etm_pid_fmt cs_etm__get_pid_fmt(struct cs_etm_queue *etmq) return etmq->etm->pid_fmt; } -static int cs_etm__map_trace_id(u8 trace_chan_id, u64 *cpu_metadata) +static int cs_etm__insert_trace_id_node(struct cs_etm_queue *etmq, + u8 trace_chan_id, u64 *cpu_metadata) { - struct int_node *inode; - /* Get an RB node for this CPU */ - inode = intlist__findnew(traceid_list, trace_chan_id); + struct int_node *inode = intlist__findnew(etmq->traceid_list, trace_chan_id); /* Something went wrong, no need to continue */ if (!inode) return -ENOMEM; + /* Disallow re-mapping a different traceID to metadata pair. */ + if (inode->priv) { + u64 *curr_cpu_data = inode->priv; + u8 curr_chan_id; + int err; + + if (curr_cpu_data[CS_ETM_CPU] != cpu_metadata[CS_ETM_CPU]) { + pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n"); + return -EINVAL; + } + + /* check that the mapped ID matches */ + err = cs_etm__metadata_get_trace_id(&curr_chan_id, curr_cpu_data); + if (err) + return err; + + if (curr_chan_id != trace_chan_id) { + pr_err("CS_ETM: mismatch between CPU trace ID and HW_ID packet ID\n"); + return -EINVAL; + } + + /* Skip re-adding the same mappings if everything matched */ + return 0; + } + + /* Not one we've seen before, associate the traceID with the metadata pointer */ + inode->priv = cpu_metadata; + + return 0; +} + +static struct cs_etm_queue *cs_etm__get_queue(struct cs_etm_auxtrace *etm, int cpu) +{ + if (etm->per_thread_decoding) + return etm->queues.queue_array[0].priv; + else + return etm->queues.queue_array[cpu].priv; +} + +static int cs_etm__map_trace_id_v0(struct cs_etm_auxtrace *etm, u8 trace_chan_id, + u64 *cpu_metadata) +{ + struct cs_etm_queue *etmq; + /* - * The node for that CPU should not be taken. - * Back out if that's the case. + * If the queue is unformatted then only save one mapping in the + * queue associated with that CPU so only one decoder is made. */ - if (inode->priv) - return -EINVAL; + etmq = cs_etm__get_queue(etm, cpu_metadata[CS_ETM_CPU]); + if (etmq->format == UNFORMATTED) + return cs_etm__insert_trace_id_node(etmq, trace_chan_id, + cpu_metadata); - /* All good, associate the traceID with the metadata pointer */ - inode->priv = cpu_metadata; + /* + * Otherwise, version 0 trace IDs are global so save them into every + * queue. 
+ */ + for (unsigned int i = 0; i < etm->queues.nr_queues; ++i) { + int ret; + + etmq = etm->queues.queue_array[i].priv; + ret = cs_etm__insert_trace_id_node(etmq, trace_chan_id, + cpu_metadata); + if (ret) + return ret; + } return 0; } +static int cs_etm__process_trace_id_v0(struct cs_etm_auxtrace *etm, int cpu, + u64 hw_id) +{ + int err; + u64 *cpu_data; + u8 trace_chan_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id); + + cpu_data = get_cpu_data(etm, cpu); + if (cpu_data == NULL) + return -EINVAL; + + err = cs_etm__map_trace_id_v0(etm, trace_chan_id, cpu_data); + if (err) + return err; + + /* + * if we are picking up the association from the packet, need to plug + * the correct trace ID into the metadata for setting up decoders later. + */ + return cs_etm__metadata_set_trace_id(trace_chan_id, cpu_data); +} + static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata) { u64 cs_etm_magic = cpu_metadata[CS_ETM_MAGIC]; @@ -329,17 +409,13 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, { struct cs_etm_auxtrace *etm; struct perf_sample sample; - struct int_node *inode; struct evsel *evsel; - u64 *cpu_data; u64 hw_id; int cpu, version, err; - u8 trace_chan_id, curr_chan_id; /* extract and parse the HW ID */ hw_id = event->aux_output_hw_id.hw_id; version = FIELD_GET(CS_AUX_HW_ID_VERSION_MASK, hw_id); - trace_chan_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id); /* check that we can handle this version */ if (version > CS_AUX_HW_ID_CURR_VERSION) { @@ -367,43 +443,7 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, return -EINVAL; } - /* See if the ID is mapped to a CPU, and it matches the current CPU */ - inode = intlist__find(traceid_list, trace_chan_id); - if (inode) { - cpu_data = inode->priv; - if ((int)cpu_data[CS_ETM_CPU] != cpu) { - pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n"); - return -EINVAL; - } - - /* check that the mapped ID matches */ - err = cs_etm__metadata_get_trace_id(&curr_chan_id, cpu_data); - if (err) - return err; - if (curr_chan_id != trace_chan_id) { - pr_err("CS_ETM: mismatch between CPU trace ID and HW_ID packet ID\n"); - return -EINVAL; - } - - /* mapped and matched - return OK */ - return 0; - } - - cpu_data = get_cpu_data(etm, cpu); - if (cpu_data == NULL) - return err; - - /* not one we've seen before - lets map it */ - err = cs_etm__map_trace_id(trace_chan_id, cpu_data); - if (err) - return err; - - /* - * if we are picking up the association from the packet, need to plug - * the correct trace ID into the metadata for setting up decoders later. 
- */ - err = cs_etm__metadata_set_trace_id(trace_chan_id, cpu_data); - return err; + return cs_etm__process_trace_id_v0(etm, cpu, hw_id); } void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq, @@ -856,6 +896,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq) static void cs_etm__free_queue(void *priv) { + struct int_node *inode, *tmp; struct cs_etm_queue *etmq = priv; if (!etmq) @@ -863,6 +904,14 @@ static void cs_etm__free_queue(void *priv) cs_etm_decoder__free(etmq->decoder); cs_etm__free_traceid_queues(etmq); + + /* First remove all traceID/metadata nodes for the RB tree */ + intlist__for_each_entry_safe(inode, tmp, etmq->traceid_list) + intlist__remove(etmq->traceid_list, inode); + + /* Then the RB tree itself */ + intlist__delete(etmq->traceid_list); + free(etmq); } @@ -885,19 +934,12 @@ static void cs_etm__free_events(struct perf_session *session) static void cs_etm__free(struct perf_session *session) { int i; - struct int_node *inode, *tmp; struct cs_etm_auxtrace *aux = container_of(session->auxtrace, struct cs_etm_auxtrace, auxtrace); cs_etm__free_events(session); session->auxtrace = NULL; - /* First remove all traceID/metadata nodes for the RB tree */ - intlist__for_each_entry_safe(inode, tmp, traceid_list) - intlist__remove(traceid_list, inode); - /* Then the RB tree itself */ - intlist__delete(traceid_list); - for (i = 0; i < aux->num_cpu; i++) zfree(&aux->metadata[i]); @@ -1055,9 +1097,24 @@ static struct cs_etm_queue *cs_etm__alloc_queue(void) etmq->traceid_queues_list = intlist__new(NULL); if (!etmq->traceid_queues_list) - free(etmq); + goto out_free; + + /* + * Create an RB tree for traceID-metadata tuple. Since the conversion + * has to be made for each packet that gets decoded, optimizing access + * in anything other than a sequential array is worth doing. + */ + etmq->traceid_list = intlist__new(NULL); + if (!etmq->traceid_list) + goto out_free; return etmq; + +out_free: + intlist__delete(etmq->traceid_queues_list); + free(etmq); + + return NULL; } static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, @@ -2182,7 +2239,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq, PERF_IP_FLAG_TRACE_END; break; case CS_ETM_EXCEPTION: - ret = cs_etm__get_magic(packet->trace_chan_id, &magic); + ret = cs_etm__get_magic(etmq, packet->trace_chan_id, &magic); if (ret) return ret; @@ -3099,7 +3156,8 @@ static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) } /* map trace ids to correct metadata block, from information in metadata */ -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm, int num_cpu, + u64 **metadata) { u64 cs_etm_magic; u8 trace_chan_id; @@ -3121,7 +3179,7 @@ static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) /* unknown magic number */ return -EINVAL; } - err = cs_etm__map_trace_id(trace_chan_id, metadata[i]); + err = cs_etm__map_trace_id_v0(etm, trace_chan_id, metadata[i]); if (err) return err; } @@ -3252,23 +3310,12 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, u64 *ptr = NULL; u64 **metadata = NULL; - /* - * Create an RB tree for traceID-metadata tuple. Since the conversion - * has to be made for each packet that gets decoded, optimizing access - * in anything other than a sequential array is worth doing. 
- */ - traceid_list = intlist__new(NULL); - if (!traceid_list) - return -ENOMEM; - /* First the global part */ ptr = (u64 *) auxtrace_info->priv; num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; metadata = zalloc(sizeof(*metadata) * num_cpu); - if (!metadata) { - err = -ENOMEM; - goto err_free_traceid_list; - } + if (!metadata) + return -ENOMEM; /* Start parsing after the common part of the header */ i = CS_HEADER_VERSION_MAX; @@ -3447,7 +3494,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, err = cs_etm__clear_unused_trace_ids_metadata(num_cpu, metadata); /* otherwise, this is a file with metadata values only, map from metadata */ else - err = cs_etm__map_trace_ids_metadata(num_cpu, metadata); + err = cs_etm__map_trace_ids_metadata(etm, num_cpu, metadata); if (err) goto err_free_queues; @@ -3469,7 +3516,5 @@ err_free_metadata: for (int j = 0; j < num_cpu; j++) zfree(&metadata[j]); zfree(&metadata); -err_free_traceid_list: - intlist__delete(traceid_list); return err; } diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index 4696267a32f0..f4f69f7cc0f3 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -252,7 +252,7 @@ enum cs_etm_pid_fmt { #ifdef HAVE_CSTRACE_SUPPORT #include -int cs_etm__get_cpu(u8 trace_chan_id, int *cpu); +int cs_etm__get_cpu(struct cs_etm_queue *etmq, u8 trace_chan_id, int *cpu); enum cs_etm_pid_fmt cs_etm__get_pid_fmt(struct cs_etm_queue *etmq); int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id, ocsd_ex_level el); -- cgit v1.2.3 From 19c3e4db38c5bf30c7e7b53dad5a464d7031dec4 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:46 +0100 Subject: perf: cs-etm: Create decoders based on the trace ID mappings Now that each queue has a unique set of trace ID mappings, use this list to create the decoders. In unformatted mode just add a single mapping so only one decoder is made. Previously each queue would have a decoder created for each traced CPU on the system but this won't work anymore because CPUs can have overlapping trace IDs. This also means that the CORESIGHT_TRACE_ID_UNUSED_FLAG isn't needed any more. If mappings aren't added then decoders aren't created, rather than needing a flag to suppress creation. 
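A toy version of the resulting per-queue relationship, where the number of decoders simply equals the number of trace ID mappings (a hypothetical flat array standing in for the real intlist):

    #include <stdio.h>

    #define MAX_TRACE_IDS 128

    struct queue {
        /* trace ID -> metadata index; -1 means "no mapping" */
        int map[MAX_TRACE_IDS];
    };

    /* One decoder per mapping: no mappings means no decoders, so no
     * "unused" flag is needed to suppress creation. */
    static int nr_decoders(const struct queue *q)
    {
        int n = 0;

        for (int id = 0; id < MAX_TRACE_IDS; id++)
            if (q->map[id] >= 0)
                n++;
        return n;
    }

    int main(void)
    {
        struct queue q;

        for (int id = 0; id < MAX_TRACE_IDS; id++)
            q.map[id] = -1;

        q.map[0x10] = 0;  /* e.g. trace ID 0x10 maps to CPU 0's metadata */
        printf("decoders needed: %d\n", nr_decoders(&q));  /* prints 1 */
        return 0;
    }

An unformatted queue would hold exactly one such mapping, matching the single-decoder case described above.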
Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-5-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 8 +- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 - tools/perf/util/cs-etm.c | 155 ++++++++---------------- tools/perf/util/cs-etm.h | 10 -- 4 files changed, 55 insertions(+), 122 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 96aeb7cdbee1..3bbc1e57d71d 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -654,8 +654,7 @@ static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, /* Get trace configuration register */ data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_get_config(itr); /* traceID set to legacy version, in case new perf running on older system */ - data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | - CORESIGHT_TRACE_ID_UNUSED_FLAG; + data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); /* Get read-only information from sysFS */ cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], @@ -687,7 +686,7 @@ static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, st /* Get trace configuration register */ data[CS_ETE_TRCCONFIGR] = cs_etmv4_get_config(itr); /* traceID set to legacy version, in case new perf running on older system */ - data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; + data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); /* Get read-only information from sysFS */ cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR0], &data[CS_ETE_TRCIDR0]); @@ -743,8 +742,7 @@ static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset, /* Get configuration register */ info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr); /* traceID set to legacy value in case new perf running on old system */ - info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | - CORESIGHT_TRACE_ID_UNUSED_FLAG; + info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); /* Get read-only information from sysFS */ cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv3_ro[CS_ETM_ETMCCER], &info->priv[*offset + CS_ETM_ETMCCER]); diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 0c9c48cedbf1..d49c3e9c7c21 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -684,10 +684,6 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params, return -1; } - /* if the CPU has no trace ID associated, no decoder needed */ - if (csid == CORESIGHT_TRACE_ID_UNUSED_VAL) - return 0; - if (d_params->operation == CS_ETM_OPERATION_DECODE) { if (ocsd_dt_create_decoder(decoder->dcd_tree, decoder->decoder_name, diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 0f56618fc7a8..01c60a660fb3 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -348,7 +348,6 @@ static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata) /* * update 
metadata trace ID from the value found in the AUX_HW_INFO packet. - * This will also clear the CORESIGHT_TRACE_ID_UNUSED_FLAG flag if present. */ static int cs_etm__metadata_set_trace_id(u8 trace_chan_id, u64 *cpu_metadata) { @@ -700,80 +699,58 @@ static void cs_etm__packet_dump(const char *pkt_string) } static void cs_etm__set_trace_param_etmv3(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, int t_idx, - int m_idx, u32 etmidr) + u64 *metadata, u32 etmidr) { - u64 **metadata = etm->metadata; - - t_params[t_idx].protocol = cs_etm__get_v7_protocol_version(etmidr); - t_params[t_idx].etmv3.reg_ctrl = metadata[m_idx][CS_ETM_ETMCR]; - t_params[t_idx].etmv3.reg_trc_id = metadata[m_idx][CS_ETM_ETMTRACEIDR]; + t_params->protocol = cs_etm__get_v7_protocol_version(etmidr); + t_params->etmv3.reg_ctrl = metadata[CS_ETM_ETMCR]; + t_params->etmv3.reg_trc_id = metadata[CS_ETM_ETMTRACEIDR]; } static void cs_etm__set_trace_param_etmv4(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, int t_idx, - int m_idx) + u64 *metadata) { - u64 **metadata = etm->metadata; - - t_params[t_idx].protocol = CS_ETM_PROTO_ETMV4i; - t_params[t_idx].etmv4.reg_idr0 = metadata[m_idx][CS_ETMV4_TRCIDR0]; - t_params[t_idx].etmv4.reg_idr1 = metadata[m_idx][CS_ETMV4_TRCIDR1]; - t_params[t_idx].etmv4.reg_idr2 = metadata[m_idx][CS_ETMV4_TRCIDR2]; - t_params[t_idx].etmv4.reg_idr8 = metadata[m_idx][CS_ETMV4_TRCIDR8]; - t_params[t_idx].etmv4.reg_configr = metadata[m_idx][CS_ETMV4_TRCCONFIGR]; - t_params[t_idx].etmv4.reg_traceidr = metadata[m_idx][CS_ETMV4_TRCTRACEIDR]; + t_params->protocol = CS_ETM_PROTO_ETMV4i; + t_params->etmv4.reg_idr0 = metadata[CS_ETMV4_TRCIDR0]; + t_params->etmv4.reg_idr1 = metadata[CS_ETMV4_TRCIDR1]; + t_params->etmv4.reg_idr2 = metadata[CS_ETMV4_TRCIDR2]; + t_params->etmv4.reg_idr8 = metadata[CS_ETMV4_TRCIDR8]; + t_params->etmv4.reg_configr = metadata[CS_ETMV4_TRCCONFIGR]; + t_params->etmv4.reg_traceidr = metadata[CS_ETMV4_TRCTRACEIDR]; } static void cs_etm__set_trace_param_ete(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, int t_idx, - int m_idx) + u64 *metadata) { - u64 **metadata = etm->metadata; - - t_params[t_idx].protocol = CS_ETM_PROTO_ETE; - t_params[t_idx].ete.reg_idr0 = metadata[m_idx][CS_ETE_TRCIDR0]; - t_params[t_idx].ete.reg_idr1 = metadata[m_idx][CS_ETE_TRCIDR1]; - t_params[t_idx].ete.reg_idr2 = metadata[m_idx][CS_ETE_TRCIDR2]; - t_params[t_idx].ete.reg_idr8 = metadata[m_idx][CS_ETE_TRCIDR8]; - t_params[t_idx].ete.reg_configr = metadata[m_idx][CS_ETE_TRCCONFIGR]; - t_params[t_idx].ete.reg_traceidr = metadata[m_idx][CS_ETE_TRCTRACEIDR]; - t_params[t_idx].ete.reg_devarch = metadata[m_idx][CS_ETE_TRCDEVARCH]; + t_params->protocol = CS_ETM_PROTO_ETE; + t_params->ete.reg_idr0 = metadata[CS_ETE_TRCIDR0]; + t_params->ete.reg_idr1 = metadata[CS_ETE_TRCIDR1]; + t_params->ete.reg_idr2 = metadata[CS_ETE_TRCIDR2]; + t_params->ete.reg_idr8 = metadata[CS_ETE_TRCIDR8]; + t_params->ete.reg_configr = metadata[CS_ETE_TRCCONFIGR]; + t_params->ete.reg_traceidr = metadata[CS_ETE_TRCTRACEIDR]; + t_params->ete.reg_devarch = metadata[CS_ETE_TRCDEVARCH]; } static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, - enum cs_etm_format format, - int sample_cpu, - int decoders) -{ - int t_idx, m_idx; - u32 etmidr; - u64 architecture; - - for (t_idx = 0; t_idx < decoders; t_idx++) { - if (format == FORMATTED) - m_idx = t_idx; - else { - m_idx = get_cpu_data_idx(etm, sample_cpu); - if (m_idx == -1) { - pr_warning("CS_ETM: 
unknown CPU, falling back to first metadata\n"); - m_idx = 0; - } - } + struct cs_etm_queue *etmq) +{ + struct int_node *inode; - architecture = etm->metadata[m_idx][CS_ETM_MAGIC]; + intlist__for_each_entry(inode, etmq->traceid_list) { + u64 *metadata = inode->priv; + u64 architecture = metadata[CS_ETM_MAGIC]; + u32 etmidr; switch (architecture) { case __perf_cs_etmv3_magic: - etmidr = etm->metadata[m_idx][CS_ETM_ETMIDR]; - cs_etm__set_trace_param_etmv3(t_params, etm, t_idx, m_idx, etmidr); + etmidr = metadata[CS_ETM_ETMIDR]; + cs_etm__set_trace_param_etmv3(t_params++, metadata, etmidr); break; case __perf_cs_etmv4_magic: - cs_etm__set_trace_param_etmv4(t_params, etm, t_idx, m_idx); + cs_etm__set_trace_param_etmv4(t_params++, metadata); break; case __perf_cs_ete_magic: - cs_etm__set_trace_param_ete(t_params, etm, t_idx, m_idx); + cs_etm__set_trace_param_ete(t_params++, metadata); break; default: return -EINVAL; @@ -3186,35 +3163,6 @@ static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm, int num_c return 0; } -/* - * If we found AUX_HW_ID packets, then set any metadata marked as unused to the - * unused value to reduce the number of unneeded decoders created. - */ -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) -{ - u64 cs_etm_magic; - int i; - - for (i = 0; i < num_cpu; i++) { - cs_etm_magic = metadata[i][CS_ETM_MAGIC]; - switch (cs_etm_magic) { - case __perf_cs_etmv3_magic: - if (metadata[i][CS_ETM_ETMTRACEIDR] & CORESIGHT_TRACE_ID_UNUSED_FLAG) - metadata[i][CS_ETM_ETMTRACEIDR] = CORESIGHT_TRACE_ID_UNUSED_VAL; - break; - case __perf_cs_etmv4_magic: - case __perf_cs_ete_magic: - if (metadata[i][CS_ETMV4_TRCTRACEIDR] & CORESIGHT_TRACE_ID_UNUSED_FLAG) - metadata[i][CS_ETMV4_TRCTRACEIDR] = CORESIGHT_TRACE_ID_UNUSED_VAL; - break; - default: - /* unknown magic number */ - return -EINVAL; - } - } - return 0; -} - /* * Use the data gathered by the peeks for HW_ID (trace ID mappings) and AUX * (formatted or not) packets to create the decoders. @@ -3222,21 +3170,26 @@ static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) static int cs_etm__create_queue_decoders(struct cs_etm_queue *etmq) { struct cs_etm_decoder_params d_params; + struct cs_etm_trace_params *t_params; + int decoders = intlist__nr_entries(etmq->traceid_list); + + if (decoders == 0) + return 0; /* * Each queue can only contain data from one CPU when unformatted, so only one decoder is * needed. */ - int decoders = etmq->format == FORMATTED ? etmq->etm->num_cpu : 1; + if (etmq->format == UNFORMATTED) + assert(decoders == 1); /* Use metadata to fill in trace parameters for trace decoder */ - struct cs_etm_trace_params *t_params = zalloc(sizeof(*t_params) * decoders); + t_params = zalloc(sizeof(*t_params) * decoders); if (!t_params) goto out_free; - if (cs_etm__init_trace_params(t_params, etmq->etm, etmq->format, - etmq->queue_nr, decoders)) + if (cs_etm__init_trace_params(t_params, etmq)) goto out_free; /* Set decoder parameters to decode trace packets */ @@ -3462,9 +3415,9 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, /* * Map Trace ID values to CPU metadata. * - * Trace metadata will always contain Trace ID values from the legacy algorithm. If the - * files has been recorded by a "new" perf updated to handle AUX_HW_ID then the metadata - * ID value will also have the CORESIGHT_TRACE_ID_UNUSED_FLAG set. 
+ * Trace metadata will always contain Trace ID values from the legacy algorithm + * in case it's read by a version of Perf that doesn't know about HW_ID packets + * or the kernel doesn't emit them. * * The updated kernel drivers that use AUX_HW_ID to send Trace IDs will attempt to use * the same IDs as the old algorithm as far as is possible, unless there are clashes @@ -3473,12 +3426,11 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, * * For a perf able to interpret AUX_HW_ID packets we first check for the presence of * those packets. If they are there then the values will be mapped and plugged into - * the metadata. We then set any remaining metadata values with the used flag to a - * value CORESIGHT_TRACE_ID_UNUSED_VAL - which indicates no decoder is required. + * the metadata and decoders are only created for each mapping received. * * If no AUX_HW_ID packets are present - which means a file recorded on an old kernel - * then we map Trace ID values to CPU directly from the metadata - clearing any unused - * flags if present. + * then we map Trace ID values to CPU directly from the metadata and create decoders + * for all mappings. */ /* Scan for AUX_OUTPUT_HW_ID records to map trace ID values to CPU metadata */ @@ -3489,15 +3441,12 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, if (err) goto err_free_queues; - /* if HW ID found then clear any unused metadata ID values */ - if (aux_hw_id_found) - err = cs_etm__clear_unused_trace_ids_metadata(num_cpu, metadata); - /* otherwise, this is a file with metadata values only, map from metadata */ - else + /* if no HW ID found this is a file with metadata values only, map from metadata */ + if (!aux_hw_id_found) { err = cs_etm__map_trace_ids_metadata(etm, num_cpu, metadata); - - if (err) - goto err_free_queues; + if (err) + goto err_free_queues; + } err = cs_etm__create_decoders(etm); if (err) diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index f4f69f7cc0f3..a8caeea720aa 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -230,16 +230,6 @@ struct cs_etm_packet_queue { /* CoreSight trace ID is currently the bottom 7 bits of the value */ #define CORESIGHT_TRACE_ID_VAL_MASK GENMASK(6, 0) -/* - * perf record will set the legacy meta data values as unused initially. - * This allows perf report to manage the decoders created when dynamic - * allocation in operation. - */ -#define CORESIGHT_TRACE_ID_UNUSED_FLAG BIT(31) - -/* Value to set for unused trace ID values */ -#define CORESIGHT_TRACE_ID_UNUSED_VAL 0x7F - int cs_etm__process_auxtrace_info(union perf_event *event, struct perf_session *session); void cs_etm_get_default_config(const struct perf_pmu *pmu, struct perf_event_attr *attr); -- cgit v1.2.3 From 940007cee539fd07c7c274030f7f7252f8c5a5d7 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:47 +0100 Subject: perf: cs-etm: Only save valid trace IDs into files This isn't a bug because Perf always masks with CORESIGHT_TRACE_ID_VAL_MASK before using these values, but to avoid it looking like it could be one, make an effort to not save bad values.
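For background on the constant 48 that appears in the fix below: legacy trace IDs are handed out as 0x10 + 2 * cpu (mirroring CORESIGHT_LEGACY_CPU_TRACE_ID() in coresight-pmu.h), and trace IDs 0x70 and above are reserved in the CoreSight trace ID space, so CPU 48 is the first CPU whose legacy ID would land on a reserved value. A small worked example, assuming that macro definition:

    #include <stdio.h>

    /* Assumed to mirror CORESIGHT_LEGACY_CPU_TRACE_ID() */
    #define LEGACY_TRACE_ID(cpu)    (0x10 + ((cpu) * 2))

    int main(void)
    {
            printf("cpu 47 -> 0x%x\n", LEGACY_TRACE_ID(47));              /* 0x6e, valid */
            printf("cpu 48 -> 0x%x\n", LEGACY_TRACE_ID(48));              /* 0x70, reserved */
            printf("cpu 48 wrapped -> 0x%x\n", LEGACY_TRACE_ID(48 % 48)); /* 0x10 again */
            return 0;
    }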
Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-6-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 3bbc1e57d71d..ea891d12f8f4 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -643,7 +643,8 @@ static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu) static __u64 cs_etm_get_legacy_trace_id(struct perf_cpu cpu) { - return CORESIGHT_LEGACY_CPU_TRACE_ID(cpu.cpu); + /* Wrap at 48 so that invalid trace IDs aren't saved into files. */ + return CORESIGHT_LEGACY_CPU_TRACE_ID(cpu.cpu % 48); } static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, struct perf_cpu cpu) -- cgit v1.2.3 From 1506af6db8c4abbe3d5dd573ec72b90f81abfcf7 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:48 +0100 Subject: perf: cs-etm: Support version 0.1 of HW_ID packets v0.1 HW_ID packets have a new field that describes which sink each CPU writes to. Use the sink ID to link trace ID maps to each other so that mappings are shared wherever the sink is shared. Also update the error message to show that overlapping IDs aren't an error in per-thread mode, just not supported. In the future we can use the CPU ID from the AUX records, or watch for changing sink IDs on HW_ID packets to use the correct decoders. Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-7-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/coresight-pmu.h | 17 ++++-- tools/perf/util/cs-etm.c | 100 ++++++++++++++++++++++++++++++++---- 2 files changed, 103 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h index 51ac441a37c3..89b0ac0014b0 100644 --- a/tools/include/linux/coresight-pmu.h +++ b/tools/include/linux/coresight-pmu.h @@ -49,12 +49,21 @@ * Interpretation of the PERF_RECORD_AUX_OUTPUT_HW_ID payload. * Used to associate a CPU with the CoreSight Trace ID. * [07:00] - Trace ID - uses 8 bits to make value easy to read in file. - * [59:08] - Unused (SBZ) - * [63:60] - Version + * [39:08] - Sink ID - as reported in /sys/bus/event_source/devices/cs_etm/sinks/ + * Added in minor version 1. + * [55:40] - Unused (SBZ) + * [59:56] - Minor Version - previously existing fields are compatible with + * all minor versions. + * [63:60] - Major Version - previously existing fields mean different things + * in new major versions. 
*/ #define CS_AUX_HW_ID_TRACE_ID_MASK GENMASK_ULL(7, 0) -#define CS_AUX_HW_ID_VERSION_MASK GENMASK_ULL(63, 60) +#define CS_AUX_HW_ID_SINK_ID_MASK GENMASK_ULL(39, 8) -#define CS_AUX_HW_ID_CURR_VERSION 0 +#define CS_AUX_HW_ID_MINOR_VERSION_MASK GENMASK_ULL(59, 56) +#define CS_AUX_HW_ID_MAJOR_VERSION_MASK GENMASK_ULL(63, 60) + +#define CS_AUX_HW_ID_MAJOR_VERSION 0 +#define CS_AUX_HW_ID_MINOR_VERSION 1 #endif diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 01c60a660fb3..775d4f35c270 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -118,6 +118,12 @@ struct cs_etm_queue { struct cs_etm_traceid_queue **traceid_queues; /* Conversion between traceID and metadata pointers */ struct intlist *traceid_list; + /* + * Same as traceid_list, but traceid_list may be a reference to another + * queue's which has a matching sink ID. + */ + struct intlist *own_traceid_list; + u32 sink_id; }; static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm); @@ -142,6 +148,7 @@ static int cs_etm__metadata_set_trace_id(u8 trace_chan_id, u64 *cpu_metadata); (queue_nr << 16 | trace_chan_id) #define TO_QUEUE_NR(cs_queue_nr) (cs_queue_nr >> 16) #define TO_TRACE_CHAN_ID(cs_queue_nr) (cs_queue_nr & 0x0000ffff) +#define SINK_UNSET ((u32) -1) static u32 cs_etm__get_v7_protocol_version(u32 etmidr) { @@ -241,7 +248,16 @@ static int cs_etm__insert_trace_id_node(struct cs_etm_queue *etmq, int err; if (curr_cpu_data[CS_ETM_CPU] != cpu_metadata[CS_ETM_CPU]) { - pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n"); + /* + * With > CORESIGHT_TRACE_IDS_MAX ETMs, overlapping IDs + * are expected (but not supported) in per-thread mode, + * rather than signifying an error. + */ + if (etmq->etm->per_thread_decoding) + pr_err("CS_ETM: overlapping Trace IDs aren't currently supported in per-thread mode\n"); + else + pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n"); + return -EINVAL; } @@ -326,6 +342,64 @@ static int cs_etm__process_trace_id_v0(struct cs_etm_auxtrace *etm, int cpu, return cs_etm__metadata_set_trace_id(trace_chan_id, cpu_data); } +static int cs_etm__process_trace_id_v0_1(struct cs_etm_auxtrace *etm, int cpu, + u64 hw_id) +{ + struct cs_etm_queue *etmq = cs_etm__get_queue(etm, cpu); + int ret; + u64 *cpu_data; + u32 sink_id = FIELD_GET(CS_AUX_HW_ID_SINK_ID_MASK, hw_id); + u8 trace_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id); + + /* + * Check sink id hasn't changed in per-cpu mode. In per-thread mode, + * let it pass for now until an actual overlapping trace ID is hit. In + * most cases IDs won't overlap even if the sink changes. 
+ */ + if (!etmq->etm->per_thread_decoding && etmq->sink_id != SINK_UNSET && + etmq->sink_id != sink_id) { + pr_err("CS_ETM: mismatch between sink IDs\n"); + return -EINVAL; + } + + etmq->sink_id = sink_id; + + /* Find which other queues use this sink and link their ID maps */ + for (unsigned int i = 0; i < etm->queues.nr_queues; ++i) { + struct cs_etm_queue *other_etmq = etm->queues.queue_array[i].priv; + + /* Different sinks, skip */ + if (other_etmq->sink_id != etmq->sink_id) + continue; + + /* Already linked, skip */ + if (other_etmq->traceid_list == etmq->traceid_list) + continue; + + /* At the point of first linking, this one should be empty */ + if (!intlist__empty(etmq->traceid_list)) { + pr_err("CS_ETM: Can't link populated trace ID lists\n"); + return -EINVAL; + } + + etmq->own_traceid_list = NULL; + intlist__delete(etmq->traceid_list); + etmq->traceid_list = other_etmq->traceid_list; + break; + } + + cpu_data = get_cpu_data(etm, cpu); + ret = cs_etm__insert_trace_id_node(etmq, trace_id, cpu_data); + if (ret) + return ret; + + ret = cs_etm__metadata_set_trace_id(trace_id, cpu_data); + if (ret) + return ret; + + return 0; +} + static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata) { u64 cs_etm_magic = cpu_metadata[CS_ETM_MAGIC]; @@ -414,10 +488,10 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, /* extract and parse the HW ID */ hw_id = event->aux_output_hw_id.hw_id; - version = FIELD_GET(CS_AUX_HW_ID_VERSION_MASK, hw_id); + version = FIELD_GET(CS_AUX_HW_ID_MAJOR_VERSION_MASK, hw_id); /* check that we can handle this version */ - if (version > CS_AUX_HW_ID_CURR_VERSION) { + if (version > CS_AUX_HW_ID_MAJOR_VERSION) { pr_err("CS ETM Trace: PERF_RECORD_AUX_OUTPUT_HW_ID version %d not supported. Please update Perf.\n", version); return -EINVAL; @@ -442,7 +516,10 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, return -EINVAL; } - return cs_etm__process_trace_id_v0(etm, cpu, hw_id); + if (FIELD_GET(CS_AUX_HW_ID_MINOR_VERSION_MASK, hw_id) == 0) + return cs_etm__process_trace_id_v0(etm, cpu, hw_id); + + return cs_etm__process_trace_id_v0_1(etm, cpu, hw_id); } void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq, @@ -882,12 +959,14 @@ static void cs_etm__free_queue(void *priv) cs_etm_decoder__free(etmq->decoder); cs_etm__free_traceid_queues(etmq); - /* First remove all traceID/metadata nodes for the RB tree */ - intlist__for_each_entry_safe(inode, tmp, etmq->traceid_list) - intlist__remove(etmq->traceid_list, inode); + if (etmq->own_traceid_list) { + /* First remove all traceID/metadata nodes for the RB tree */ + intlist__for_each_entry_safe(inode, tmp, etmq->own_traceid_list) + intlist__remove(etmq->own_traceid_list, inode); - /* Then the RB tree itself */ - intlist__delete(etmq->traceid_list); + /* Then the RB tree itself */ + intlist__delete(etmq->own_traceid_list); + } free(etmq); } @@ -1081,7 +1160,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(void) * has to be made for each packet that gets decoded, optimizing access * in anything other than a sequential array is worth doing. 
*/ - etmq->traceid_list = intlist__new(NULL); + etmq->traceid_list = etmq->own_traceid_list = intlist__new(NULL); if (!etmq->traceid_list) goto out_free; @@ -1113,6 +1192,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, etmq->queue_nr = queue_nr; queue->cpu = queue_nr; /* Placeholder, may be reset to -1 in per-thread mode */ etmq->offset = 0; + etmq->sink_id = SINK_UNSET; return 0; } -- cgit v1.2.3 From 022aa67b5ab9077d8f5bf02df5a7f0f2d4dfb909 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:49 +0100 Subject: perf: cs-etm: Print queue number in raw trace dump Now that we have overlapping trace IDs it's also useful to know what the queue number is to be able to distinguish the source of the trace so print it inline. Hide it behind the -v option because it might not be obvious to users what the queue number is. Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-8-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 ++-- tools/perf/util/cs-etm-decoder/cs-etm-decoder.h | 2 +- tools/perf/util/cs-etm.c | 13 ++++++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index d49c3e9c7c21..b78ef0262135 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -41,7 +41,7 @@ const u32 INSTR_PER_NS = 10; struct cs_etm_decoder { void *data; - void (*packet_printer)(const char *msg); + void (*packet_printer)(const char *msg, void *data); bool suppress_printing; dcd_tree_handle_t dcd_tree; cs_etm_mem_cb_type mem_access; @@ -202,7 +202,7 @@ static void cs_etm_decoder__print_str_cb(const void *p_context, const struct cs_etm_decoder *decoder = p_context; if (p_context && str_len && !decoder->suppress_printing) - decoder->packet_printer(msg); + decoder->packet_printer(msg, decoder->data); } static int diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h index 272c2efe78ee..12c782fa6db2 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h @@ -60,7 +60,7 @@ struct cs_etm_trace_params { struct cs_etm_decoder_params { int operation; - void (*packet_printer)(const char *msg); + void (*packet_printer)(const char *msg, void *data); cs_etm_mem_cb_type mem_acc_cb; bool formatted; bool fsyncs; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 775d4f35c270..90f32f327b9b 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -762,15 +762,22 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm, } } -static void cs_etm__packet_dump(const char *pkt_string) +static void cs_etm__packet_dump(const char *pkt_string, void *data) { const char *color = PERF_COLOR_BLUE; int len = strlen(pkt_string); + struct cs_etm_queue *etmq = data; + char queue_nr[64]; + + if (verbose) + snprintf(queue_nr, sizeof(queue_nr), "Qnr:%d; ", etmq->queue_nr); + else + queue_nr[0] = '\0'; if (len && 
(pkt_string[len-1] == '\n')) - color_fprintf(stdout, color, " %s", pkt_string); + color_fprintf(stdout, color, " %s%s", queue_nr, pkt_string); else - color_fprintf(stdout, color, " %s\n", pkt_string); + color_fprintf(stdout, color, " %s%s\n", queue_nr, pkt_string); fflush(stdout); } -- cgit v1.2.3 From d9c993100ef1906bd1f25bab789390828ef10a41 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:48 -0700 Subject: perf session: Document 'struct perf_session' and constify its 'auxtrace' member perf_session is a central data structure to the tool so let's comment it. The auxtrace callbacks are never modified in session so constify. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 7c8dd6956330..e56518639711 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -26,26 +26,72 @@ struct decomp_data { struct zstd_data *zstd_decomp; }; +/** + * struct perf_session- A Perf session holds the main state when the program is + * working with live perf events or reading data from an input file. + * + * The rough organization of a perf_session is: + * ``` + * +--------------+ +-----------+ +------------+ + * | Session |1..* ----->| Machine |1..* ----->| Thread | + * +--------------+ +-----------+ +------------+ + * ``` + */ struct perf_session { + /** + * @header: The read version of a perf_file_header, or captures global + * information from a live session. + */ struct perf_header header; + /** @machines: Machines within the session a host and 0 or more guests. */ struct machines machines; + /** @evlist: List of evsels/events of the session. */ struct evlist *evlist; - struct auxtrace *auxtrace; + /** @auxtrace: callbacks to allow AUX area data decoding. */ + const struct auxtrace *auxtrace; + /** @itrace_synth_opts: AUX area tracing synthesis options. */ struct itrace_synth_opts *itrace_synth_opts; + /** @auxtrace_index: index of AUX area tracing events within a perf.data file. */ struct list_head auxtrace_index; #ifdef HAVE_LIBTRACEEVENT + /** @tevent: handles for libtraceevent and plugins. */ struct trace_event tevent; #endif + /** @time_conv: Holds contents of last PERF_RECORD_TIME_CONV event. */ struct perf_record_time_conv time_conv; + /** + * @repipe: When set causes certain reading (header and trace events) to + * also write events. The written file descriptor must be provided for + * the header but is implicitly stdout for trace events. + */ bool repipe; + /** + * @one_mmap: The reader will use a single mmap by default. There may be + * multiple data files in particular for aux events. If this is true + * then the single big mmap for the data file can be assumed. + */ bool one_mmap; + /** @one_mmap_addr: Address of initial perf data file reader mmap. */ void *one_mmap_addr; + /** @one_mmap_offset: File offset in perf.data file when mapped. */ u64 one_mmap_offset; + /** @ordered_events: Used to turn unordered events into ordered ones. */ struct ordered_events ordered_events; + /** @data: Optional perf data file being read from. 
*/ struct perf_data *data; + /** @tool: callbacks for event handling. */ const struct perf_tool *tool; + /** + * @bytes_transferred: Used by perf record to count written bytes before + * compression. + */ u64 bytes_transferred; + /** + * @bytes_compressed: Used by perf record to count written bytes after + * compression. + */ u64 bytes_compressed; + /** @zstd_data: Owner of global compression state, buffers, etc. */ struct zstd_data zstd_data; struct decomp_data decomp_data; struct decomp_data *active_decomp; -- cgit v1.2.3 From d71bbe799c0cda4a0581f98ea3e4d646a18e799d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:49 -0700 Subject: perf header: Add kerneldoc to 'struct perf_file_header' Some of the values are a little strange so add documentation to resolve ambiguity. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 07ff647197ff..3285981948d7 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -61,14 +61,28 @@ struct perf_file_section { u64 size; }; +/** + * struct perf_file_header: Header representation on disk. + */ struct perf_file_header { + /** @magic: Holds "PERFILE2". */ u64 magic; + /** @size: Size of this header - sizeof(struct perf_file_header). */ u64 size; + /** + * @attr_size: Size of attrs entries - sizeof(struct perf_event_attr) + + * sizeof(struct perf_file_section). + */ u64 attr_size; + /** @attrs: Offset and size of file section holding attributes. */ struct perf_file_section attrs; + /** @data: Offset and size of file section holding regular event data. */ struct perf_file_section data; - /* event_types is ignored */ + /** @event_types: Ignored. */ struct perf_file_section event_types; + /** + * @adds_features: Bitmap of features. The features are immediately after the data section. + */ DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS); }; -- cgit v1.2.3 From 10df481fda13d29c2d9893b7460b8a7095d0ccf5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:50 -0700 Subject: perf header: Fail read if header sections overlap Buggy perf.data files can have the attributes and data overlapping. For example, when processing pipe data the attributes aren't known and so file offset header calculations can consider them not present. Later this can cause the attributes to overwrite the data. This can be seen in: $ perf record -o - true > a.data [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.059 MB - ] $ perf inject -i a.data -o b.data $ perf report --stats -i b.data 0x68 [0]: failed to process type: 510379 [Invalid argument] Error: failed to process sample $ This change makes reading the corrupt file fail: $ perf report --stats -i b.data Perf file header corrupt: Attributes and data overlap incompatible file format (rerun with -v to learn more) $ Which is more informative. 
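The check added below boils down to the standard overlap test for two half-open ranges [offset, offset + size): they collide exactly when each starts before the other ends. A simplified standalone sketch (the offsets are made up; the patch additionally rejects sections that overlap the header itself):

    #include <stdbool.h>
    #include <stdio.h>

    struct file_section { unsigned long long offset, size; };

    /* [a, a+asz) and [b, b+bsz) overlap iff each begins before the
     * other ends. */
    static bool sections_overlap(const struct file_section *a,
                                 const struct file_section *b)
    {
            return a->offset < b->offset + b->size &&
                   b->offset < a->offset + a->size;
    }

    int main(void)
    {
            struct file_section attrs = { .offset = 0x68, .size = 0x80 };
            struct file_section data  = { .offset = 0x68, .size = 0x40 };

            printf("corrupt: %d\n", sections_overlap(&attrs, &data)); /* 1 */
            data.offset = 0x1000;
            printf("corrupt: %d\n", sections_overlap(&attrs, &data)); /* 0 */
            return 0;
    }

The version added in the patch expresses the same idea, split by which section comes first in the file.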
Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3309fe7f1d12..65c9086610cb 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3986,6 +3986,24 @@ int perf_file_header__read(struct perf_file_header *header, adds_features)); } + if (header->size > header->attrs.offset) { + pr_err("Perf file header corrupt: header overlaps attrs\n"); + return -1; + } + + if (header->size > header->data.offset) { + pr_err("Perf file header corrupt: header overlaps data\n"); + return -1; + } + + if ((header->attrs.offset <= header->data.offset && + header->attrs.offset + header->attrs.size > header->data.offset) || + (header->attrs.offset > header->data.offset && + header->data.offset + header->data.size > header->attrs.offset)) { + pr_err("Perf file header corrupt: Attributes and data overlap\n"); + return -1; + } + if (header->size != sizeof(*header)) { /* Support the previous format */ if (header->size == offsetof(typeof(*header), adds_features)) -- cgit v1.2.3 From e9a7053da377e19ebc6f1afe55c7f75219aaef79 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:51 -0700 Subject: perf header: Allow attributes to be written after data With a file, to write data an offset needs to be known. Typically data follows the event attributes in a file. However, if processing a pipe, the number of event attributes may not be known. It is convenient in that case to write the attributes after the data. Expand perf_session__do_write_header() to allow this when the data offset and size are known. This approach may be useful for more than just taking a pipe file to write into a data file: `perf inject --itrace` will reserve an additional 8kb for attributes, which would be unnecessary if the attributes were written after the data.
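A rough sketch of the two layouts this enables (all sizes below are invented for illustration; real offsets come from the evlist and the feature sections):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long sz_header = 104, sz_ids = 64, sz_attrs = 272,
                               sz_data = 4096, sz_feat = 512;

            /* Conventional layout: ids and attrs sit between the header and
             * the data, so the data offset requires the attr count up front. */
            printf("attrs @ %llu, data @ %llu\n",
                   sz_header + sz_ids, sz_header + sz_ids + sz_attrs);

            /* attrs-after-data: data starts right after the header; features
             * and then attrs are appended once their sizes are known. */
            printf("data @ %llu, attrs @ %llu\n",
                   sz_header, sz_header + sz_data + sz_feat);
            return 0;
    }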
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 106 ++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 65c9086610cb..4eb39463067e 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3676,32 +3676,50 @@ int perf_header__write_pipe(int fd) static int perf_session__do_write_header(struct perf_session *session, struct evlist *evlist, int fd, bool at_exit, - struct feat_copier *fc) + struct feat_copier *fc, + bool write_attrs_after_data) { struct perf_file_header f_header; - struct perf_file_attr f_attr; struct perf_header *header = &session->header; struct evsel *evsel; struct feat_fd ff = { .fd = fd, }; - u64 attr_offset; + u64 attr_offset = sizeof(f_header), attr_size = 0; int err; - lseek(fd, sizeof(f_header), SEEK_SET); + if (write_attrs_after_data && at_exit) { + /* + * Write features at the end of the file first so that + * attributes may come after them. + */ + if (!header->data_offset && header->data_size) { + pr_err("File contains data but offset unknown\n"); + err = -1; + goto err_out; + } + header->feat_offset = header->data_offset + header->data_size; + err = perf_header__adds_write(header, evlist, fd, fc); + if (err < 0) + goto err_out; + attr_offset = lseek(fd, 0, SEEK_CUR); + } else { + lseek(fd, attr_offset, SEEK_SET); + } evlist__for_each_entry(session->evlist, evsel) { - evsel->id_offset = lseek(fd, 0, SEEK_CUR); - err = do_write(&ff, evsel->core.id, evsel->core.ids * sizeof(u64)); - if (err < 0) { - pr_debug("failed to write perf header\n"); - free(ff.buf); - return err; + evsel->id_offset = attr_offset; + /* Avoid writing at the end of the file until the session is exiting. */ + if (!write_attrs_after_data || at_exit) { + err = do_write(&ff, evsel->core.id, evsel->core.ids * sizeof(u64)); + if (err < 0) { + pr_debug("failed to write perf header\n"); + goto err_out; + } } + attr_offset += evsel->core.ids * sizeof(u64); } - attr_offset = lseek(ff.fd, 0, SEEK_CUR); - evlist__for_each_entry(evlist, evsel) { if (evsel->core.attr.size < sizeof(evsel->core.attr)) { /* @@ -3711,40 +3729,46 @@ static int perf_session__do_write_header(struct perf_session *session, */ evsel->core.attr.size = sizeof(evsel->core.attr); } - f_attr = (struct perf_file_attr){ - .attr = evsel->core.attr, - .ids = { - .offset = evsel->id_offset, - .size = evsel->core.ids * sizeof(u64), + /* Avoid writing at the end of the file until the session is exiting. 
*/ + if (!write_attrs_after_data || at_exit) { + struct perf_file_attr f_attr = { + .attr = evsel->core.attr, + .ids = { + .offset = evsel->id_offset, + .size = evsel->core.ids * sizeof(u64), + } + }; + err = do_write(&ff, &f_attr, sizeof(f_attr)); + if (err < 0) { + pr_debug("failed to write perf header attribute\n"); + goto err_out; } - }; - err = do_write(&ff, &f_attr, sizeof(f_attr)); - if (err < 0) { - pr_debug("failed to write perf header attribute\n"); - free(ff.buf); - return err; } + attr_size += sizeof(struct perf_file_attr); } - if (!header->data_offset) - header->data_offset = lseek(fd, 0, SEEK_CUR); + if (!header->data_offset) { + if (write_attrs_after_data) + header->data_offset = sizeof(f_header); + else + header->data_offset = attr_offset + attr_size; + } header->feat_offset = header->data_offset + header->data_size; - if (at_exit) { + if (!write_attrs_after_data && at_exit) { + /* Write features now feat_offset is known. */ err = perf_header__adds_write(header, evlist, fd, fc); - if (err < 0) { - free(ff.buf); - return err; - } + if (err < 0) + goto err_out; } f_header = (struct perf_file_header){ .magic = PERF_MAGIC, .size = sizeof(f_header), - .attr_size = sizeof(f_attr), + .attr_size = sizeof(struct perf_file_attr), .attrs = { .offset = attr_offset, - .size = evlist->core.nr_entries * sizeof(f_attr), + .size = attr_size, }, .data = { .offset = header->data_offset, @@ -3757,21 +3781,24 @@ static int perf_session__do_write_header(struct perf_session *session, lseek(fd, 0, SEEK_SET); err = do_write(&ff, &f_header, sizeof(f_header)); - free(ff.buf); if (err < 0) { pr_debug("failed to write perf header\n"); - return err; + goto err_out; + } else { + lseek(fd, 0, SEEK_END); + err = 0; } - lseek(fd, header->data_offset + header->data_size, SEEK_SET); - - return 0; +err_out: + free(ff.buf); + return err; } int perf_session__write_header(struct perf_session *session, struct evlist *evlist, int fd, bool at_exit) { - return perf_session__do_write_header(session, evlist, fd, at_exit, NULL); + return perf_session__do_write_header(session, evlist, fd, at_exit, /*fc=*/NULL, + /*write_attrs_after_data=*/false); } size_t perf_session__data_offset(const struct evlist *evlist) @@ -3793,7 +3820,8 @@ int perf_session__inject_header(struct perf_session *session, int fd, struct feat_copier *fc) { - return perf_session__do_write_header(session, evlist, fd, true, fc); + return perf_session__do_write_header(session, evlist, fd, true, fc, + /*write_attrs_after_data=*/false); } static int perf_header__getbuffer64(struct perf_header *header, -- cgit v1.2.3 From bd0b4836a2333d5c725397d458c8edfa66f1d9bb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Tue, 27 Aug 2024 01:13:01 +0000 Subject: selftests/bpf: Make sure stashed kptr in local kptr is freed recursively When dropping a local kptr, any kptr stashed into it is supposed to be freed through bpf_obj_free_fields->__bpf_obj_drop_impl recursively. Add a test to make sure it happens. The test first stashes a referenced kptr to "struct task" into a local kptr and gets the reference count of the task. Then, it drops the local kptr and reads the reference count of the task again. Since bpf_obj_free_fields and __bpf_obj_drop_impl will go through the local kptr recursively during bpf_obj_drop, the dtor of the stashed task kptr should eventually be called. The second reference count should be one less than the first one. 
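The expectation can be modelled in plain userspace C (a toy model of the recursive drop; the types and helpers are invented stand-ins for the BPF objects, not the BPF runtime itself):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct task { int refcnt; };                  /* stands in for rcu_users */
    struct local_kptr { struct task *stashed; };  /* local kptr with a stash */

    static void put_task(struct task *t) { t->refcnt--; }

    /* Models bpf_obj_drop() walking the object's fields recursively and
     * releasing the stashed reference. */
    static void obj_drop(struct local_kptr *p)
    {
            if (p->stashed)
                    put_task(p->stashed);
            free(p);
    }

    int main(void)
    {
            struct task t = { .refcnt = 2 }; /* count after bpf_task_acquire() */
            struct local_kptr *p = calloc(1, sizeof(*p));

            p->stashed = &t;
            int before = t.refcnt;
            obj_drop(p);
            assert(before == t.refcnt + 1); /* exactly one reference released */
            printf("ok: %d -> %d\n", before, t.refcnt);
            return 0;
    }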
Signed-off-by: Amery Hung Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240827011301.608620-1-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/task_kfunc_success.c | 30 +++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 3138bb689b0b..a55149015063 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -143,8 +143,9 @@ int BPF_PROG(test_task_acquire_leave_in_map, struct task_struct *task, u64 clone SEC("tp_btf/task_newtask") int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) { - struct task_struct *kptr; + struct task_struct *kptr, *acquired; struct __tasks_kfunc_map_value *v, *local; + int refcnt, refcnt_after_drop; long status; if (!is_test_kfunc_task()) @@ -190,7 +191,34 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } + /* Stash a copy into local kptr and check if it is released recursively */ + acquired = bpf_task_acquire(kptr); + if (!acquired) { + err = 7; + bpf_obj_drop(local); + bpf_task_release(kptr); + return 0; + } + bpf_probe_read_kernel(&refcnt, sizeof(refcnt), &acquired->rcu_users); + + acquired = bpf_kptr_xchg(&local->task, acquired); + if (acquired) { + err = 8; + bpf_obj_drop(local); + bpf_task_release(kptr); + bpf_task_release(acquired); + return 0; + } + bpf_obj_drop(local); + + bpf_probe_read_kernel(&refcnt_after_drop, sizeof(refcnt_after_drop), &kptr->rcu_users); + if (refcnt != refcnt_after_drop + 1) { + err = 9; + bpf_task_release(kptr); + return 0; + } + bpf_task_release(kptr); return 0; -- cgit v1.2.3 From 3d8806f37d318b10ddf9fa686821f58bc6301c7b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 28 Aug 2024 10:36:09 -0700 Subject: tools: ynl: error check scanf() in a sample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Someone reported on GitHub that the YNL NIPA test is failing when run locally. The test builds the tools, and it hits: netdev.c:82:9: warning: ignoring return value of ‘scanf’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 82 | scanf("%d", &ifindex); I can't repro this on my setups but error seems clear enough. 
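In isolation, the pattern the fix below applies, checking that scanf() performed the expected number of conversions before trusting its output, looks like this:

    #include <stdio.h>

    int main(void)
    {
            int ifindex;

            /* scanf() returns how many conversions succeeded; anything other
             * than 1 here means no integer was parsed from stdin. */
            if (scanf("%d", &ifindex) != 1) {
                    fprintf(stderr, "Error: unable to parse input\n");
                    return 1;
            }
            printf("ifindex: %d\n", ifindex);
            return 0;
    }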
Link: https://github.com/linux-netdev/nipa/discussions/37 Reviewed-by: Simon Horman Reviewed-by: Nicolas Dichtel Link: https://patch.msgid.link/20240828173609.2951335-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/samples/netdev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index 3e7b29bd55d5..22609d44c89a 100644 --- a/tools/net/ynl/samples/netdev.c +++ b/tools/net/ynl/samples/netdev.c @@ -79,7 +79,10 @@ int main(int argc, char **argv) goto err_close; printf("Select ifc ($ifindex; or 0 = dump; or -2 ntf check): "); - scanf("%d", &ifindex); + if (scanf("%d", &ifindex) != 1) { + fprintf(stderr, "Error: unable to parse input\n"); + goto err_destroy; + } if (ifindex > 0) { struct netdev_dev_get_req *req; @@ -119,6 +122,7 @@ int main(int argc, char **argv) err_close: fprintf(stderr, "YNL: %s\n", ys->err.msg); +err_destroy: ynl_sock_destroy(ys); return 2; } -- cgit v1.2.3 From b24d22ac74bfdde2e1146e1838abdc99131b1d19 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 28 Aug 2024 14:37:52 -0400 Subject: selftests: add selftest for tcp SO_PEEK_OFF support We add a selftest to check that the new feature added in commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option") works correctly. Reviewed-by: Jason Xing Reviewed-by: Stefano Brivio Tested-by: Stefano Brivio Signed-off-by: Jon Maloy Link: https://patch.msgid.link/20240828183752.660267-3-jmaloy@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/tcp_so_peek_off.c | 183 ++++++++++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 tools/testing/selftests/net/tcp_so_peek_off.c (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8eaffd7a641c..1179e3261bef 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -80,6 +80,7 @@ TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr +TEST_GEN_PROGS += tcp_so_peek_off TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh diff --git a/tools/testing/selftests/net/tcp_so_peek_off.c b/tools/testing/selftests/net/tcp_so_peek_off.c new file mode 100644 index 000000000000..df8a39d9d3c3 --- /dev/null +++ b/tools/testing/selftests/net/tcp_so_peek_off.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" + +static char *afstr(int af) +{ + return af == AF_INET ? 
"TCP/IPv4" : "TCP/IPv6"; +} + +int tcp_peek_offset_probe(sa_family_t af) +{ + int optv = 0; + int ret = 0; + int s; + + s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + ksft_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + ret = 1; + else + printf("%s does not support SO_PEEK_OFF\n", afstr(af)); + close(s); + } + return ret; +} + +static void tcp_peek_offset_set(int s, int offset) +{ + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + ksft_perror("Failed to set SO_PEEK_OFF value\n"); +} + +static int tcp_peek_offset_get(int s) +{ + int offset; + socklen_t len = sizeof(offset); + + if (getsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, &len)) + ksft_perror("Failed to get SO_PEEK_OFF value\n"); + return offset; +} + +static int tcp_peek_offset_test(sa_family_t af) +{ + union { + struct sockaddr sa; + struct sockaddr_in a4; + struct sockaddr_in6 a6; + } a; + int res = 0; + int s[2] = {0, 0}; + int recv_sock = 0; + int offset = 0; + ssize_t len; + char buf; + + memset(&a, 0, sizeof(a)); + a.sa.sa_family = af; + + s[0] = socket(af, SOCK_STREAM, IPPROTO_TCP); + s[1] = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + + if (s[0] < 0 || s[1] < 0) { + ksft_perror("Temporary socket creation failed\n"); + goto out; + } + if (bind(s[0], &a.sa, sizeof(a)) < 0) { + ksft_perror("Temporary socket bind() failed\n"); + goto out; + } + if (getsockname(s[0], &a.sa, &((socklen_t) { sizeof(a) })) < 0) { + ksft_perror("Temporary socket getsockname() failed\n"); + goto out; + } + if (listen(s[0], 0) < 0) { + ksft_perror("Temporary socket listen() failed\n"); + goto out; + } + if (connect(s[1], &a.sa, sizeof(a)) >= 0 || errno != EINPROGRESS) { + ksft_perror("Temporary socket connect() failed\n"); + goto out; + } + recv_sock = accept(s[0], NULL, NULL); + if (recv_sock <= 0) { + ksft_perror("Temporary socket accept() failed\n"); + goto out; + } + + /* Some basic tests of getting/setting offset */ + offset = tcp_peek_offset_get(recv_sock); + if (offset != -1) { + ksft_perror("Initial value of socket offset not -1\n"); + goto out; + } + tcp_peek_offset_set(recv_sock, 0); + offset = tcp_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Failed to set socket offset to 0\n"); + goto out; + } + + /* Transfer a message */ + if (send(s[1], (char *)("ab"), 2, 0) <= 0 || errno != EINPROGRESS) { + ksft_perror("Temporary probe socket send() failed\n"); + goto out; + } + /* Read first byte */ + len = recv(recv_sock, &buf, 1, MSG_PEEK); + if (len != 1 || buf != 'a') { + ksft_perror("Failed to read first byte of message\n"); + goto out; + } + offset = tcp_peek_offset_get(recv_sock); + if (offset != 1) { + ksft_perror("Offset not forwarded correctly at first byte\n"); + goto out; + } + /* Try to read beyond last byte */ + len = recv(recv_sock, &buf, 2, MSG_PEEK); + if (len != 1 || buf != 'b') { + ksft_perror("Failed to read last byte of message\n"); + goto out; + } + offset = tcp_peek_offset_get(recv_sock); + if (offset != 2) { + ksft_perror("Offset not forwarded correctly at last byte\n"); + goto out; + } + /* Flush message */ + len = recv(recv_sock, NULL, 2, MSG_TRUNC); + if (len != 2) { + ksft_perror("Failed to flush message\n"); + goto out; + } + offset = tcp_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Offset not reverted correctly after flush\n"); + goto out; + } + + printf("%s with MSG_PEEK_OFF works correctly\n", afstr(af)); + res = 1; +out: + if (recv_sock >= 0) + 
close(recv_sock); + if (s[1] >= 0) + close(s[1]); + if (s[0] >= 0) + close(s[0]); + return res; +} + +int main(void) +{ + int res4, res6; + + res4 = tcp_peek_offset_probe(AF_INET); + res6 = tcp_peek_offset_probe(AF_INET6); + + if (!res4 && !res6) + return KSFT_SKIP; + + if (res4) + res4 = tcp_peek_offset_test(AF_INET); + + if (res6) + res6 = tcp_peek_offset_test(AF_INET6); + + if (!res4 || !res6) + return KSFT_FAIL; + + return KSFT_PASS; +} -- cgit v1.2.3 From a0dbf6d0b21e197bf919591081ff2eb7a34ef982 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 29 Aug 2024 14:08:27 -0700 Subject: selftests/bpf: attach struct_ops maps before test prog runs In test_loader based tests, call bpf_map__attach_struct_ops() before the call to bpf_prog_test_run_opts() in order to trigger bpf_struct_ops->reg() callbacks on the kernel side. This allows the __retval macro to be used for struct_ops tests. Signed-off-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 4223cffc090e..3e9b009580d4 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -890,11 +890,13 @@ void run_subtest(struct test_loader *tester, { struct test_subspec *subspec = unpriv ? &spec->unpriv : &spec->priv; struct bpf_program *tprog = NULL, *tprog_iter; + struct bpf_link *link, *links[32] = {}; struct test_spec *spec_iter; struct cap_state caps = {}; struct bpf_object *tobj; struct bpf_map *map; int retval, err, i; + int links_cnt = 0; bool should_load; if (!test__start_subtest(subspec->name)) @@ -999,6 +1001,26 @@ if (restore_capabilities(&caps)) goto tobj_cleanup; + /* Do bpf_map__attach_struct_ops() for each struct_ops map. + * This should trigger bpf_struct_ops->reg callback on kernel side. + */ + bpf_object__for_each_map(map, tobj) { + if (!bpf_map__autocreate(map) || + bpf_map__type(map) != BPF_MAP_TYPE_STRUCT_OPS) + continue; + if (links_cnt >= ARRAY_SIZE(links)) { + PRINT_FAIL("too many struct_ops maps"); + goto tobj_cleanup; + } + link = bpf_map__attach_struct_ops(map); + if (!link) { + PRINT_FAIL("bpf_map__attach_struct_ops failed for map %s: err=%d\n", + bpf_map__name(map), err); + goto tobj_cleanup; + } + links[links_cnt++] = link; + } + if (tester->pre_execution_cb) { err = tester->pre_execution_cb(tobj); if (err) { @@ -1013,9 +1035,14 @@ PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; } + /* redo bpf_map__attach_struct_ops for each test */ + while (links_cnt > 0) + bpf_link__destroy(links[--links_cnt]); } tobj_cleanup: + while (links_cnt > 0) + bpf_link__destroy(links[--links_cnt]); bpf_object__close(tobj); subtest_cleanup: test__end_subtest(); -- cgit v1.2.3 From 47e69431b57aee6c3c134014e7fca490ca8ca9d1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:28 -0700 Subject: selftests/bpf: Test gen_prologue and gen_epilogue This test adds a new struct_ops "bpf_testmod_st_ops" in bpf_testmod. The ops of bpf_testmod_st_ops are triggered by new kfunc calls "bpf_kfunc_st_ops_test_*logue". These new kfunc calls are primarily used by the SEC("syscall") program.
The test triggering sequence is like: SEC("syscall") syscall_prologue(struct st_ops_args *args) bpf_kfunc_st_op_test_prologue(args) st_ops->test_prologue(args) .gen_prologue adds 1000 to args->a .gen_epilogue adds 10000 to args->a .gen_epilogue will also set the r0 to 2 * args->a. The .gen_prologue and .gen_epilogue of the bpf_testmod_st_ops will test the prog->aux->attach_func_name to decide if it needs to generate codes. The main programs of the pro_epilogue.c will call a new kfunc bpf_kfunc_st_ops_inc10 which does "args->a += 10". It will also call a subprog() which does "args->a += 1". This patch uses the test_loader infra to check the __xlated instructions patched after gen_prologue and/or gen_epilogue. The __xlated check is based on Eduard's example (Thanks!) in v1. args->a is returned by the struct_ops prog (either the main prog or the epilogue). Thus, the __retval of the SEC("syscall") prog is checked. For example, when triggering the ops in the 'SEC("struct_ops/test_epilogue") int test_epilogue' The expected args->a is +1 (subprog call) + 10 (kfunc call) + 10000 (.gen_epilogue) = 10011. The expected return value is 2 * 10011 (.gen_epilogue). Suggested-by: Eduard Zingerman Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 190 +++++++++++++++++++++ .../selftests/bpf/bpf_testmod/bpf_testmod.h | 11 ++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 6 + .../selftests/bpf/prog_tests/pro_epilogue.c | 10 ++ tools/testing/selftests/bpf/progs/pro_epilogue.c | 154 +++++++++++++++++ 5 files changed, 371 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/pro_epilogue.c create mode 100644 tools/testing/selftests/bpf/progs/pro_epilogue.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 8a71a91b752d..42c38dbb6835 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "bpf_testmod.h" @@ -945,6 +946,51 @@ out: return err; } +static DEFINE_MUTEX(st_ops_mutex); +static struct bpf_testmod_st_ops *st_ops; + +__bpf_kfunc int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_prologue) + ret = st_ops->test_prologue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_epilogue) + ret = st_ops->test_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_pro_epilogue) + ret = st_ops->test_pro_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) +{ + args->a += 10; + return args->a; +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -981,6 +1027,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) 
BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1100,6 +1150,144 @@ struct bpf_struct_ops bpf_testmod_ops2 = { .owner = THIS_MODULE, }; +static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_pro_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_prologue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r6 = r1[0]; // r6 will be "struct st_ops *args". r1 is "u64 *ctx". + * r7 = r6->a; + * r7 += 1000; + * r6->a = r7; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, offsetof(struct st_ops_args, a)); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +static int st_ops_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_epilogue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct st_ops *args" + * r6 = r1->a; + * r6 += 10000; + * r1->a = r6; + * r0 = r6; + * r0 *= 2; + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 10000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_6); + *insn++ = BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, 2); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +static int st_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + if (off < 0 || off + size > sizeof(struct st_ops_args)) + return -EACCES; + return 0; +} + +static const struct bpf_verifier_ops st_ops_verifier_ops = { + .is_valid_access = bpf_testmod_ops_is_valid_access, + .btf_struct_access = st_ops_btf_struct_access, + .gen_prologue = st_ops_gen_prologue, + .gen_epilogue = st_ops_gen_epilogue, + .get_func_proto = bpf_base_func_proto, +}; + +static struct bpf_testmod_st_ops st_ops_cfi_stubs = { + .test_prologue = bpf_test_mod_st_ops__test_prologue, + .test_epilogue = bpf_test_mod_st_ops__test_epilogue, + .test_pro_epilogue = bpf_test_mod_st_ops__test_pro_epilogue, +}; + +static int st_ops_reg(void *kdata, struct bpf_link *link) +{ + int err = 0; + + mutex_lock(&st_ops_mutex); + if (st_ops) 
{ + pr_err("st_ops has already been registered\n"); + err = -EEXIST; + goto unlock; + } + st_ops = kdata; + +unlock: + mutex_unlock(&st_ops_mutex); + return err; +} + +static void st_ops_unreg(void *kdata, struct bpf_link *link) +{ + mutex_lock(&st_ops_mutex); + st_ops = NULL; + mutex_unlock(&st_ops_mutex); +} + +static int st_ops_init(struct btf *btf) +{ + return 0; +} + +static int st_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static struct bpf_struct_ops testmod_st_ops = { + .verifier_ops = &st_ops_verifier_ops, + .init = st_ops_init, + .init_member = st_ops_init_member, + .reg = st_ops_reg, + .unreg = st_ops_unreg, + .cfi_stubs = &st_ops_cfi_stubs, + .name = "bpf_testmod_st_ops", + .owner = THIS_MODULE, +}; + extern int bpf_fentry_test1(int a); static int bpf_testmod_init(void) @@ -1117,8 +1305,10 @@ static int bpf_testmod_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_testmod_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_testmod_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set); ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops); ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); + ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, ARRAY_SIZE(bpf_testmod_dtors), THIS_MODULE); diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index fe0d402b0d65..fb7dff47597a 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -94,4 +94,15 @@ struct bpf_testmod_ops2 { int (*test_1)(void); }; +struct st_ops_args { + u64 a; +}; + +struct bpf_testmod_st_ops { + int (*test_prologue)(struct st_ops_args *args); + int (*test_epilogue)(struct st_ops_args *args); + int (*test_pro_epilogue)(struct st_ops_args *args); + struct module *owner; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index c6c314965bb1..7e76532be5fa 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -148,4 +148,10 @@ struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) __ksym; void bpf_kfunc_nested_release_test(struct sk_buff *ptr) __ksym; +struct st_ops_args; +int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c new file mode 100644 index 000000000000..00b806804b99 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include "pro_epilogue.skel.h" + +void test_pro_epilogue(void) +{ + RUN_TESTS(pro_epilogue); +} diff --git a/tools/testing/selftests/bpf/progs/pro_epilogue.c b/tools/testing/selftests/bpf/progs/pro_epilogue.c new file mode 100644 index 000000000000..44bc3f06b4b6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pro_epilogue.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +void __kfunc_btf_root(void) +{ + bpf_kfunc_st_ops_inc10(NULL); +} + +static __noinline __used int subprog(struct st_ops_args *args) +{ + args->a += 1; + return args->a; +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* main prog */ +__xlated("4: r1 = *(u64 *)(r1 +0)") +__xlated("5: r6 = r1") +__xlated("6: call kernel-function") +__xlated("7: r1 = r6") +__xlated("8: call pc+1") +__xlated("9: exit") +SEC("struct_ops/test_prologue") +__naked int test_prologue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: r1 = *(u64 *)(r1 +0)") +__xlated("2: r6 = r1") +__xlated("3: call kernel-function") +__xlated("4: r1 = r6") +__xlated("5: call pc+") +/* epilogue */ +__xlated("6: r1 = *(u64 *)(r10 -8)") +__xlated("7: r1 = *(u64 *)(r1 +0)") +__xlated("8: r6 = *(u64 *)(r1 +0)") +__xlated("9: r6 += 10000") +__xlated("10: *(u64 *)(r1 +0) = r6") +__xlated("11: r0 = r6") +__xlated("12: r0 *= 2") +__xlated("13: exit") +SEC("struct_ops/test_epilogue") +__naked int test_epilogue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* save __u64 *ctx to stack */ +__xlated("4: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("5: r1 = *(u64 *)(r1 +0)") +__xlated("6: r6 = r1") +__xlated("7: call kernel-function") +__xlated("8: r1 = r6") +__xlated("9: call pc+") +/* epilogue */ +__xlated("10: r1 = *(u64 *)(r10 -8)") +__xlated("11: r1 = *(u64 *)(r1 +0)") +__xlated("12: r6 = *(u64 *)(r1 +0)") +__xlated("13: r6 += 10000") +__xlated("14: *(u64 *)(r1 +0) = r6") +__xlated("15: r0 = r6") +__xlated("16: r0 *= 2") +__xlated("17: exit") +SEC("struct_ops/test_pro_epilogue") +__naked int test_pro_epilogue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +SEC("syscall") +__retval(1011) /* PROLOGUE_A [1000] + KFUNC_INC10 + SUBPROG_A [1] */ +int syscall_prologue(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_prologue(&args); +} + +SEC("syscall") +__retval(20022) /* (KFUNC_INC10 + SUBPROG_A [1] + EPILOGUE_A [10000]) * 2 */ +int syscall_epilogue(void *ctx) +{ + struct st_ops_args args = {}; + + return 
bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(22022) /* (PROLOGUE_A [1000] + KFUNC_INC10 + SUBPROG_A [1] + EPILOGUE_A [10000]) * 2 */ +int syscall_pro_epilogue(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_pro_epilogue(&args); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops pro_epilogue = { + .test_prologue = (void *)test_prologue, + .test_epilogue = (void *)test_epilogue, + .test_pro_epilogue = (void *)test_pro_epilogue, +}; -- cgit v1.2.3 From b191b0fd740062ede672693671c6e6e942fb02f4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:29 -0700 Subject: selftests/bpf: Add tailcall epilogue test This patch adds a gen_epilogue test to test a main prog using a bpf_tail_call. A non-test_loader test is used. The tailcall target program, "test_epilogue_subprog", needs to be used in a struct_ops map before it can be loaded. Another struct_ops map is needed to host the actual "test_epilogue_tailcall" struct_ops program that does the bpf_tail_call. The earlier test_loader patch will attach all struct_ops maps, but bpf_testmod.c does not support >1 attached struct_ops. The earlier patch used the test_loader, which has already covered checking for the patched pro/epilogue instructions. This is done by the __xlated tag. This patch goes for the regular skel load and syscall test to do the tailcall test, which also allows directly passing the "struct st_ops_args *args" as ctx_in to the SEC("syscall") program. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 46 +++++++++++++++++ .../selftests/bpf/progs/epilogue_tailcall.c | 58 ++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/epilogue_tailcall.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index 00b806804b99..b82525c29de8 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -3,8 +3,54 @@ #include #include "pro_epilogue.skel.h" +#include "epilogue_tailcall.skel.h" + +struct st_ops_args { + __u64 a; +}; + +static void test_tailcall(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct epilogue_tailcall *skel; + struct st_ops_args args; + int err, prog_fd; + + skel = epilogue_tailcall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "epilogue_tailcall__open_and_load")) + return; + + topts.ctx_in = &args; + topts.ctx_size_in = sizeof(args); + + skel->links.epilogue_tailcall = + bpf_map__attach_struct_ops(skel->maps.epilogue_tailcall); + if (!ASSERT_OK_PTR(skel->links.epilogue_tailcall, "attach_struct_ops")) + goto done; + + /* Both test_epilogue_tailcall and test_epilogue_subprog are + * patched with epilogue. When syscall_epilogue_tailcall() + * is run, test_epilogue_tailcall() is triggered. + * It executes a tail call and control is transferred to + * test_epilogue_subprog(). Only test_epilogue_subprog() + * does args->a += 1, thus final args.a value of 10001 + * guarantees that only the epilogue of the + * test_epilogue_subprog is executed.
+ */ + memset(&args, 0, sizeof(args)); + prog_fd = bpf_program__fd(skel->progs.syscall_epilogue_tailcall); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_EQ(args.a, 10001, "args.a"); + ASSERT_EQ(topts.retval, 10001 * 2, "topts.retval"); + +done: + epilogue_tailcall__destroy(skel); +} void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); + if (test__start_subtest("tailcall")) + test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/epilogue_tailcall.c b/tools/testing/selftests/bpf/progs/epilogue_tailcall.c new file mode 100644 index 000000000000..7275dd594de0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/epilogue_tailcall.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +static __noinline __used int subprog(struct st_ops_args *args) +{ + args->a += 1; + return args->a; +} + +SEC("struct_ops/test_epilogue_subprog") +int BPF_PROG(test_epilogue_subprog, struct st_ops_args *args) +{ + subprog(args); + return args->a; +} + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); + __array(values, void (void)); +} epilogue_map SEC(".maps") = { + .values = { + [0] = (void *)&test_epilogue_subprog, + } +}; + +SEC("struct_ops/test_epilogue_tailcall") +int test_epilogue_tailcall(unsigned long long *ctx) +{ + bpf_tail_call(ctx, &epilogue_map, 0); + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_tailcall = { + .test_epilogue = (void *)test_epilogue_tailcall, +}; + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_subprog = { + .test_epilogue = (void *)test_epilogue_subprog, +}; + +SEC("syscall") +int syscall_epilogue_tailcall(struct st_ops_args *args) +{ + return bpf_kfunc_st_ops_test_epilogue(args); +} -- cgit v1.2.3 From 42fdbbde6cf4159f77de13a40f8c0be6ef48bcc1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:30 -0700 Subject: selftests/bpf: A pro/epilogue test when the main prog jumps back to the 1st insn This patch adds a pro/epilogue test when the main prog has a goto insn that goes back to the very first instruction of the prog. It is to test the correctness of the adjust_jmp_off(prog, 0, delta) after the verifier has applied the prologue and/or epilogue patch. 
Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-9-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 2 + .../selftests/bpf/progs/pro_epilogue_goto_start.c | 149 +++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index b82525c29de8..f974ae9ac610 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -4,6 +4,7 @@ #include #include "pro_epilogue.skel.h" #include "epilogue_tailcall.skel.h" +#include "pro_epilogue_goto_start.skel.h" struct st_ops_args { __u64 a; @@ -51,6 +52,7 @@ done: void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); + RUN_TESTS(pro_epilogue_goto_start); if (test__start_subtest("tailcall")) test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c b/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c new file mode 100644 index 000000000000..3529e53be355 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* main prog */ +__xlated("4: if r1 == 0x0 goto pc+5") +__xlated("5: if r1 == 0x1 goto pc+2") +__xlated("6: r1 = 1") +__xlated("7: goto pc-3") +__xlated("8: r1 = 0") +__xlated("9: goto pc-6") +__xlated("10: r0 = 0") +__xlated("11: exit") +SEC("struct_ops/test_prologue_goto_start") +__naked int test_prologue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: if r1 == 0x0 goto pc+5") +__xlated("2: if r1 == 0x1 goto pc+2") +__xlated("3: r1 = 1") +__xlated("4: goto pc-3") +__xlated("5: r1 = 0") +__xlated("6: goto pc-6") +__xlated("7: r0 = 0") +/* epilogue */ +__xlated("8: r1 = *(u64 *)(r10 -8)") +__xlated("9: r1 = *(u64 *)(r1 +0)") +__xlated("10: r6 = *(u64 *)(r1 +0)") +__xlated("11: r6 += 10000") +__xlated("12: *(u64 *)(r1 +0) = r6") +__xlated("13: r0 = r6") +__xlated("14: r0 *= 2") +__xlated("15: exit") +SEC("struct_ops/test_epilogue_goto_start") +__naked int test_epilogue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* save __u64 *ctx to stack */ +__xlated("4: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("5: if r1 == 0x0 goto pc+5") +__xlated("6: if r1 == 0x1 goto pc+2") +__xlated("7: r1 = 1") +__xlated("8: goto pc-3") +__xlated("9: r1 = 0") +__xlated("10: goto pc-6") +__xlated("11: r0 = 
0") +/* epilogue */ +__xlated("12: r1 = *(u64 *)(r10 -8)") +__xlated("13: r1 = *(u64 *)(r1 +0)") +__xlated("14: r6 = *(u64 *)(r1 +0)") +__xlated("15: r6 += 10000") +__xlated("16: *(u64 *)(r1 +0) = r6") +__xlated("17: r0 = r6") +__xlated("18: r0 *= 2") +__xlated("19: exit") +SEC("struct_ops/test_pro_epilogue_goto_start") +__naked int test_pro_epilogue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_goto_start = { + .test_prologue = (void *)test_prologue_goto_start, + .test_epilogue = (void *)test_epilogue_goto_start, + .test_pro_epilogue = (void *)test_pro_epilogue_goto_start, +}; + +SEC("syscall") +__retval(0) +int syscall_prologue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_prologue(&args); +} + +SEC("syscall") +__retval(20000) /* (EPILOGUE_A [10000]) * 2 */ +int syscall_epilogue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(22000) /* (PROLOGUE_A [1000] + EPILOGUE_A [10000]) * 2 */ +int syscall_pro_epilogue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_pro_epilogue(&args); +} -- cgit v1.2.3 From cada0bdcc471443dbd41bd63286f48be3dae0d89 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:31 -0700 Subject: selftests/bpf: Test epilogue patching when the main prog has multiple BPF_EXIT This patch tests the epilogue patching when the main prog has multiple BPF_EXIT. The verifier should have patched the 2nd (and later) BPF_EXIT with a BPF_JA that goes back to the earlier patched epilogue instructions. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 2 + tools/testing/selftests/bpf/progs/epilogue_exit.c | 82 ++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/epilogue_exit.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index f974ae9ac610..509883e6823a 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -5,6 +5,7 @@ #include "pro_epilogue.skel.h" #include "epilogue_tailcall.skel.h" #include "pro_epilogue_goto_start.skel.h" +#include "epilogue_exit.skel.h" struct st_ops_args { __u64 a; @@ -53,6 +54,7 @@ void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); RUN_TESTS(pro_epilogue_goto_start); + RUN_TESTS(epilogue_exit); if (test__start_subtest("tailcall")) test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/epilogue_exit.c b/tools/testing/selftests/bpf/progs/epilogue_exit.c new file mode 100644 index 000000000000..33d3a57bee90 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/epilogue_exit.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: r1 = *(u64 *)(r1 +0)") +__xlated("2: r2 = *(u64 *)(r1 +0)") +__xlated("3: r3 = 0") +__xlated("4: r4 = 1") +__xlated("5: if r2 == 0x0 goto pc+10") +__xlated("6: r0 = 0") +__xlated("7: *(u64 *)(r1 +0) = r3") +/* epilogue */ +__xlated("8: r1 = *(u64 *)(r10 -8)") +__xlated("9: r1 = *(u64 *)(r1 +0)") +__xlated("10: r6 = *(u64 *)(r1 +0)") +__xlated("11: r6 += 10000") +__xlated("12: *(u64 *)(r1 +0) = r6") +__xlated("13: r0 = r6") +__xlated("14: r0 *= 2") +__xlated("15: exit") +/* 2nd part of the main prog after the first exit */ +__xlated("16: *(u64 *)(r1 +0) = r4") +__xlated("17: r0 = 1") +/* Clear the r1 to ensure it does not have + * off-by-1 error and ensure it jumps back to the + * beginning of epilogue which initializes + * the r1 with the ctx ptr. + */ +__xlated("18: r1 = 0") +__xlated("19: gotol pc-12") +SEC("struct_ops/test_epilogue_exit") +__naked int test_epilogue_exit(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r2 = *(u64 *)(r1 +0);" + "r3 = 0;" + "r4 = 1;" + "if r2 == 0 goto +3;" + "r0 = 0;" + "*(u64 *)(r1 + 0) = r3;" + "exit;" + "*(u64 *)(r1 + 0) = r4;" + "r0 = 1;" + "r1 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_exit = { + .test_epilogue = (void *)test_epilogue_exit, +}; + +SEC("syscall") +__retval(20000) +int syscall_epilogue_exit0(void *ctx) +{ + struct st_ops_args args = { .a = 1 }; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(20002) +int syscall_epilogue_exit1(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} -- cgit v1.2.3 From 7c5f7b16fe1b9d1eb0cbb46d20f57db4a912b6e0 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Thu, 29 Aug 2024 21:13:15 +0100 Subject: selftests/bpf: Add tests for iter next method returning valid pointer This patch adds test cases for the iter next method returning a valid pointer, which can also be used as usage examples. Currently the iter next method should return a valid pointer. iter_next_trusted shows the correct usage and tests that the iter next method returns a valid pointer. bpf_iter_task_vma_next has the KF_RET_NULL flag, so the returned pointer may be NULL. We need to check that the pointer is not NULL before using it. iter_next_trusted_or_null shows the incorrect usage: there is no check before using the pointer, so it will be rejected by the verifier. iter_next_rcu and iter_next_rcu_or_null are similar test cases for KF_RCU_PROTECTED iterators. iter_next_rcu_not_trusted tests that the pointer returned by the iter next method of a KF_RCU_PROTECTED iterator cannot be passed to KF_TRUSTED_ARGS kfuncs. iter_next_ptr_mem_not_trusted tests that the base type PTR_TO_MEM should not be combined with the type flag PTR_TRUSTED.
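In condensed form, the distinction the verifier enforces looks like this (a fragment distilled from the iter_next_trusted and iter_next_trusted_or_null programs added below, not a standalone program; all names come from the diff):

	vma_ptr = bpf_iter_task_vma_next(&vma_it);   /* KF_RET_NULL: may be NULL */

	/* Rejected: "Possibly NULL pointer passed to trusted arg0". */
	bpf_kfunc_trusted_vma_test(vma_ptr);

	/* Accepted: the NULL check lets the verifier treat vma_ptr as a
	 * non-NULL trusted pointer for the KF_TRUSTED_ARGS kfunc.
	 */
	if (vma_ptr != NULL)
		bpf_kfunc_trusted_vma_test(vma_ptr);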
Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB5848709758F6922F02AF9F1F99962@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 20 ++++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 5 + tools/testing/selftests/bpf/prog_tests/iters.c | 5 +- tools/testing/selftests/bpf/progs/iters_testmod.c | 125 +++++++++++++++++++++ 4 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/iters_testmod.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 42c38dbb6835..c73d04bc9e9d 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -198,6 +198,22 @@ __bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) { } +__bpf_kfunc void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_task_test(struct task_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_num_test(int *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) +{ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -559,6 +575,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index 7e76532be5fa..b58817938deb 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -154,4 +154,9 @@ int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) __ksym; int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) __ksym; int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __ksym; +void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) __ksym; +void bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; +void bpf_kfunc_trusted_num_test(int *ptr) __ksym; +void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index 3c440370c1f0..89ff23c4a8bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -14,6 +14,7 @@ #include "iters_state_safety.skel.h" #include "iters_looping.skel.h" #include "iters_num.skel.h" +#include "iters_testmod.skel.h" #include "iters_testmod_seq.skel.h" #include "iters_task_vma.skel.h" #include "iters_task.skel.h" @@ -297,8 +298,10 @@ void test_iters(void) RUN_TESTS(iters); RUN_TESTS(iters_css_task); - if (env.has_testmod) + if (env.has_testmod) { + RUN_TESTS(iters_testmod); RUN_TESTS(iters_testmod_seq); + } if (test__start_subtest("num")) 
subtest_num_iters(); diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c new file mode 100644 index 000000000000..df1d3db60b1b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +SEC("raw_tp/sys_enter") +__success +int iter_next_trusted(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task_vma vma_it; + struct vm_area_struct *vma_ptr; + + bpf_iter_task_vma_new(&vma_it, cur_task, 0); + + vma_ptr = bpf_iter_task_vma_next(&vma_it); + if (vma_ptr == NULL) + goto out; + + bpf_kfunc_trusted_vma_test(vma_ptr); +out: + bpf_iter_task_vma_destroy(&vma_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int iter_next_trusted_or_null(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task_vma vma_it; + struct vm_area_struct *vma_ptr; + + bpf_iter_task_vma_new(&vma_it, cur_task, 0); + + vma_ptr = bpf_iter_task_vma_next(&vma_it); + + bpf_kfunc_trusted_vma_test(vma_ptr); + + bpf_iter_task_vma_destroy(&vma_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__success +int iter_next_rcu(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + if (task_ptr == NULL) + goto out; + + bpf_kfunc_rcu_task_test(task_ptr); +out: + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int iter_next_rcu_or_null(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + + bpf_kfunc_rcu_task_test(task_ptr); + + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("R1 must be referenced or trusted") +int iter_next_rcu_not_trusted(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + if (task_ptr == NULL) + goto out; + + bpf_kfunc_trusted_task_test(task_ptr); +out: + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("R1 cannot write into rdonly_mem") +/* Message should not be 'R1 cannot write into rdonly_trusted_mem' */ +int iter_next_ptr_mem_not_trusted(const void *ctx) +{ + struct bpf_iter_num num_it; + int *num_ptr; + + bpf_iter_num_new(&num_it, 0, 10); + + num_ptr = bpf_iter_num_next(&num_it); + if (num_ptr == NULL) + goto out; + + bpf_kfunc_trusted_num_test(num_ptr); +out: + bpf_iter_num_destroy(&num_it); + return 0; +} -- cgit v1.2.3 From 9a7db819a184c21f605a4364131762d5db0c7010 Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Tue, 20 Aug 2024 00:42:42 -0700 Subject: crypto: tools/ccp - Remove unused variable the variable is never referenced in the code, just remove them. 
Signed-off-by: Zhu Jun Reviewed-by: Mario Limonciello Signed-off-by: Herbert Xu --- tools/crypto/ccp/dbc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/crypto/ccp/dbc.c b/tools/crypto/ccp/dbc.c index a807df0f0597..80248d3d3a5a 100644 --- a/tools/crypto/ccp/dbc.c +++ b/tools/crypto/ccp/dbc.c @@ -57,7 +57,6 @@ int process_param(int fd, int msg_index, __u8 *signature, int *data) .msg_index = msg_index, .param = *data, }; - int ret; assert(signature); assert(data); -- cgit v1.2.3 From 6f0315330af7a57c1c00587fdfb69c7778bf1c50 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 29 Aug 2024 18:20:09 +0100 Subject: kselftest/arm64: Actually test SME vector length changes via sigreturn The test case for SME vector length changes via sigreturn use a bit too much cut'n'paste and only actually changed the SVE vector length in the test itself. Andre's recent factoring out of the initialisation code caused this to be exposed and the test to start failing. Fix the test to actually cover the thing it's supposed to test. Fixes: 4963aeb35a9e ("kselftest/arm64: signal: Add SME signal handling tests") Signed-off-by: Mark Brown Reviewed-by: Andre Przywara Tested-by: Andre Przywara Link: https://lore.kernel.org/r/20240829-arm64-sme-signal-vl-change-test-v1-1-42d7534cb818@kernel.org Signed-off-by: Will Deacon --- .../arm64/signal/testcases/fake_sigreturn_sme_change_vl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c index cb8c051b5c8f..dfd6a2badf9f 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c @@ -35,30 +35,30 @@ static int fake_sigreturn_ssve_change_vl(struct tdescr *td, { size_t resv_sz, offset; struct _aarch64_ctx *head = GET_SF_RESV_HEAD(sf); - struct sve_context *sve; + struct za_context *za; /* Get a signal context with a SME ZA frame in it */ if (!get_current_context(td, &sf.uc, sizeof(sf.uc))) return 1; resv_sz = GET_SF_RESV_SIZE(sf); - head = get_header(head, SVE_MAGIC, resv_sz, &offset); + head = get_header(head, ZA_MAGIC, resv_sz, &offset); if (!head) { - fprintf(stderr, "No SVE context\n"); + fprintf(stderr, "No ZA context\n"); return 1; } - if (head->size != sizeof(struct sve_context)) { + if (head->size != sizeof(struct za_context)) { fprintf(stderr, "Register data present, aborting\n"); return 1; } - sve = (struct sve_context *)head; + za = (struct za_context *)head; /* No changes are supported; init left us at minimum VL so go to max */ fprintf(stderr, "Attempting to change VL from %d to %d\n", - sve->vl, vls[0]); - sve->vl = vls[0]; + za->vl, vls[0]); + za->vl = vls[0]; fake_sigreturn(&sf, sizeof(sf), 0); -- cgit v1.2.3 From 89d64e72732f3f970a048a46a7b60164b34dfc43 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:52 -0700 Subject: perf inject: Overhaul handling of pipe files Previously inject->is_pipe was set if the input or output were a pipe. Determining the input was a pipe had to be done prior to starting the session and opening the file. This was done by comparing the input file name with '-' but it fails if the pipe file is written to disk. Opening a pipe file from disk will correctly set perf_data.is_pipe, but this is too late for 'perf inject' and results in a broken file. 
A workaround is 'cat pipe_perf|perf inject -i - ...'. This change removes inject->is_pipe and changes the dependent conditions to use the is_pipe flag on the input (inject->session->data) and output files (inject->output). This ensures the is_pipe condition reflects things like the header being read. The change removes the use of perf file header repiping, that is writing the file header out while reading it in. The case of input pipe and output file cannot repipe as the attributes for the file are unknown. To resolve this, write the file header when writing to disk and as the attributes may be unknown, write them after the data. Update sessions repipe variable to be trace_event_repipe as those are the only events now impacted by it. Update __perf_session__new as the repipe_fd no longer needs passing. Fully removing repipe from session header reading will be done in a later change. Committer testing: root@number:~# perf record -e syscalls:sys_enter_*sleep/max-stack=4/ -o - sleep 0.01 | perf report -i - # To display the perf.data header info, please use --header/--header-only options. # [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.050 MB - ] # # Total Lost Samples: 0 # # Samples: 1 of event 'syscalls:sys_enter_clock_nanosleep' # Event count (approx.): 1 # # Overhead Command Shared Object Symbol # ........ ....... ............. ............................... # 100.00% sleep libc.so.6 [.] clock_nanosleep@GLIBC_2.2.5 | ---__libc_start_main@@GLIBC_2.34 __libc_start_call_main 0x562fc2560a9f clock_nanosleep@GLIBC_2.2.5 # # (Tip: Create an archive with symtabs to analyse on other machine: perf archive) # root@number:~# perf record -e syscalls:sys_enter_*sleep/max-stack=4/ -o - sleep 0.01 > pipe.data [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.050 MB - ] root@number:~# perf report --stdio -i pipe.data # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 1 of event 'syscalls:sys_enter_clock_nanosleep' # Event count (approx.): 1 # # Overhead Command Shared Object Symbol # ........ ....... ............. ............................... # 100.00% sleep libc.so.6 [.] clock_nanosleep@GLIBC_2.2.5 | ---__libc_start_main@@GLIBC_2.34 __libc_start_call_main 0x55f775975a9f clock_nanosleep@GLIBC_2.2.5 # # (Tip: To set sampling period of individual events use perf record -e cpu/cpu-cycles,period=100001/,cpu/branches,period=10001/ ...) 
# root@number:~# Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 60 +++++++++++++++++++++++---------------------- tools/perf/util/header.c | 12 ++++----- tools/perf/util/header.h | 3 ++- tools/perf/util/session.c | 8 +++--- tools/perf/util/session.h | 14 ++++------- 5 files changed, 48 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a7c859db2e15..0ccf80fe8399 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -119,7 +119,6 @@ struct perf_inject { bool jit_mode; bool in_place_update; bool in_place_update_dry_run; - bool is_pipe; bool copy_kcore_dir; const char *input_name; struct perf_data output; @@ -205,7 +204,8 @@ static int perf_event__repipe_attr(const struct perf_tool *tool, if (ret) return ret; - if (!inject->is_pipe) + /* If the output isn't a pipe then the attributes will be written as part of the header. */ + if (!inject->output.is_pipe) return 0; return perf_event__repipe_synth(tool, event); @@ -1966,7 +1966,13 @@ static int __cmd_inject(struct perf_inject *inject) struct guest_session *gs = &inject->guest_session; struct perf_session *session = inject->session; int fd = output_fd(inject); - u64 output_data_offset; + u64 output_data_offset = perf_session__data_offset(session->evlist); + /* + * Pipe input hasn't loaded the attributes and will handle them as + * events. So that the attributes don't overlap the data, write the + * attributes after the data. 
+ */ + bool write_attrs_after_data = !inject->output.is_pipe && inject->session->data->is_pipe; signal(SIGINT, sig_handler); @@ -1980,8 +1986,6 @@ static int __cmd_inject(struct perf_inject *inject) #endif } - output_data_offset = perf_session__data_offset(session->evlist); - if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY) { inject->tool.sample = perf_event__inject_buildid; } else if (inject->sched_stat) { @@ -2075,7 +2079,7 @@ static int __cmd_inject(struct perf_inject *inject) if (!inject->itrace_synth_opts.set) auxtrace_index__free(&session->auxtrace_index); - if (!inject->is_pipe && !inject->in_place_update) + if (!inject->output.is_pipe && !inject->in_place_update) lseek(fd, output_data_offset, SEEK_SET); ret = perf_session__process_events(session); @@ -2094,7 +2098,7 @@ static int __cmd_inject(struct perf_inject *inject) } } - if (!inject->is_pipe && !inject->in_place_update) { + if (!inject->output.is_pipe && !inject->in_place_update) { struct inject_fc inj_fc = { .fc.copy = feat_copy_cb, .inject = inject, @@ -2124,7 +2128,8 @@ static int __cmd_inject(struct perf_inject *inject) } session->header.data_offset = output_data_offset; session->header.data_size = inject->bytes_written; - perf_session__inject_header(session, session->evlist, fd, &inj_fc.fc); + perf_session__inject_header(session, session->evlist, fd, &inj_fc.fc, + write_attrs_after_data); if (inject->copy_kcore_dir) { ret = copy_kcore_dir(inject); @@ -2161,7 +2166,6 @@ int cmd_inject(int argc, const char **argv) .use_stdio = true, }; int ret; - bool repipe = true; const char *known_build_ids = NULL; bool build_ids; bool build_id_all; @@ -2273,16 +2277,7 @@ int cmd_inject(int argc, const char **argv) inject.build_id_style = BID_RWS__INJECT_HEADER_ALL; data.path = inject.input_name; - if (!strcmp(inject.input_name, "-") || inject.output.is_pipe) { - inject.is_pipe = true; - /* - * Do not repipe header when input is a regular file - * since either it can rewrite the header at the end - * or write a new pipe header. - */ - if (strcmp(inject.input_name, "-")) - repipe = false; - } + ordered_events = inject.jit_mode || inject.sched_stat || (inject.build_id_style == BID_RWS__INJECT_HEADER_LAZY); perf_tool__init(&inject.tool, ordered_events); @@ -2325,9 +2320,9 @@ int cmd_inject(int argc, const char **argv) inject.tool.compressed = perf_event__repipe_op4_synth; inject.tool.auxtrace = perf_event__repipe_auxtrace; inject.tool.dont_split_sample_group = true; - inject.session = __perf_session__new(&data, repipe, - output_fd(&inject), - &inject.tool); + inject.session = __perf_session__new(&data, &inject.tool, + /*trace_event_repipe=*/inject.output.is_pipe); + if (IS_ERR(inject.session)) { ret = PTR_ERR(inject.session); goto out_close_output; @@ -2341,19 +2336,26 @@ int cmd_inject(int argc, const char **argv) if (ret) goto out_delete; - if (!data.is_pipe && inject.output.is_pipe) { + if (inject.output.is_pipe) { ret = perf_header__write_pipe(perf_data__fd(&inject.output)); if (ret < 0) { pr_err("Couldn't write a new pipe header.\n"); goto out_delete; } - ret = perf_event__synthesize_for_pipe(&inject.tool, - inject.session, - &inject.output, - perf_event__repipe); - if (ret < 0) - goto out_delete; + /* + * If the input is already a pipe then the features and + * attributes don't need synthesizing, they will be present in + * the input. 
+ */ + if (!data.is_pipe) { + ret = perf_event__synthesize_for_pipe(&inject.tool, + inject.session, + &inject.output, + perf_event__repipe); + if (ret < 0) + goto out_delete; + } } if (inject.build_id_style == BID_RWS__INJECT_HEADER_LAZY) { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4eb39463067e..59e2f37c1cb4 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3818,10 +3818,11 @@ size_t perf_session__data_offset(const struct evlist *evlist) int perf_session__inject_header(struct perf_session *session, struct evlist *evlist, int fd, - struct feat_copier *fc) + struct feat_copier *fc, + bool write_attrs_after_data) { return perf_session__do_write_header(session, evlist, fd, true, fc, - /*write_attrs_after_data=*/false); + write_attrs_after_data); } static int perf_header__getbuffer64(struct perf_header *header, @@ -4145,7 +4146,7 @@ static int perf_header__read_pipe(struct perf_session *session, int repipe_fd) struct perf_pipe_file_header f_header; if (perf_file_header__read_pipe(&f_header, header, session->data, - session->repipe, repipe_fd) < 0) { + /*repipe=*/false, repipe_fd) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; } @@ -4560,15 +4561,14 @@ int perf_event__process_tracing_data(struct perf_session *session, SEEK_SET); } - size_read = trace_report(fd, &session->tevent, - session->repipe); + size_read = trace_report(fd, &session->tevent, session->trace_event_repipe); padding = PERF_ALIGN(size_read, sizeof(u64)) - size_read; if (readn(fd, buf, padding) < 0) { pr_err("%s: reading input file", __func__); return -1; } - if (session->repipe) { + if (session->trace_event_repipe) { int retw = write(STDOUT_FILENO, buf, padding); if (retw <= 0 || retw != padding) { pr_err("%s: repiping tracing data padding", __func__); diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 3285981948d7..7137509cf6d8 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -150,7 +150,8 @@ struct feat_copier { int perf_session__inject_header(struct perf_session *session, struct evlist *evlist, int fd, - struct feat_copier *fc); + struct feat_copier *fc, + bool write_attrs_after_data); size_t perf_session__data_offset(const struct evlist *evlist); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 774eb3382000..5a51db036ee8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -135,8 +135,8 @@ static int ordered_events__deliver_event(struct ordered_events *oe, } struct perf_session *__perf_session__new(struct perf_data *data, - bool repipe, int repipe_fd, - struct perf_tool *tool) + struct perf_tool *tool, + bool trace_event_repipe) { int ret = -ENOMEM; struct perf_session *session = zalloc(sizeof(*session)); @@ -144,7 +144,7 @@ struct perf_session *__perf_session__new(struct perf_data *data, if (!session) goto out; - session->repipe = repipe; + session->trace_event_repipe = trace_event_repipe; session->tool = tool; session->decomp_data.zstd_decomp = &session->zstd_data; session->active_decomp = &session->decomp_data; @@ -162,7 +162,7 @@ struct perf_session *__perf_session__new(struct perf_data *data, session->data = data; if (perf_data__is_read(data)) { - ret = perf_session__open(session, repipe_fd); + ret = perf_session__open(session, /*repipe_fd=*/-1); if (ret < 0) goto out_delete; diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index e56518639711..bcf1bcf06959 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -59,12 +59,8 @@ 
struct perf_session { #endif /** @time_conv: Holds contents of last PERF_RECORD_TIME_CONV event. */ struct perf_record_time_conv time_conv; - /** - * @repipe: When set causes certain reading (header and trace events) to - * also write events. The written file descriptor must be provided for - * the header but is implicitly stdout for trace events. - */ - bool repipe; + /** @trace_event_repipe: When set causes read trace events to be written to stdout. */ + bool trace_event_repipe; /** * @one_mmap: The reader will use a single mmap by default. There may be * multiple data files in particular for aux events. If this is true @@ -110,13 +106,13 @@ struct decomp { struct perf_tool; struct perf_session *__perf_session__new(struct perf_data *data, - bool repipe, int repipe_fd, - struct perf_tool *tool); + struct perf_tool *tool, + bool trace_event_repipe); static inline struct perf_session *perf_session__new(struct perf_data *data, struct perf_tool *tool) { - return __perf_session__new(data, false, -1, tool); + return __perf_session__new(data, tool, /*trace_event_repipe=*/false); } void perf_session__delete(struct perf_session *session); -- cgit v1.2.3 From 2d57c32b32fb84c6f6b2342bca85c9ba2b2d759b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:53 -0700 Subject: perf header: Remove repipe option No longer used by `perf inject` the repipe_fd is always -1 and repipe is always false. Remove the options and associated code knowing the constant values of the removed variables. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 19 +++++-------------- tools/perf/util/header.h | 2 +- tools/perf/util/session.c | 6 +++--- 3 files changed, 9 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 59e2f37c1cb4..a6386d12afd7 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4113,13 +4113,8 @@ static int perf_file_section__process(struct perf_file_section *section, static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, struct perf_header *ph, - struct perf_data* data, - bool repipe, int repipe_fd) + struct perf_data *data) { - struct feat_fd ff = { - .fd = repipe_fd, - .ph = ph, - }; ssize_t ret; ret = perf_data__read(data, header, sizeof(*header)); @@ -4134,19 +4129,15 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, if (ph->needs_swap) header->size = bswap_64(header->size); - if (repipe && do_write(&ff, header, sizeof(*header)) < 0) - return -1; - return 0; } -static int perf_header__read_pipe(struct perf_session *session, int repipe_fd) +static int perf_header__read_pipe(struct perf_session *session) { struct perf_header *header = &session->header; struct perf_pipe_file_header f_header; - if (perf_file_header__read_pipe(&f_header, header, session->data, - /*repipe=*/false, repipe_fd) < 0) { + if (perf_file_header__read_pipe(&f_header, header, session->data) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; } @@ -4246,7 +4237,7 @@ static int evlist__prepare_tracepoint_events(struct evlist *evlist, struct tep_h } #endif -int perf_session__read_header(struct perf_session *session, int repipe_fd) +int 
perf_session__read_header(struct perf_session *session) { struct perf_data *data = session->data; struct perf_header *header = &session->header; @@ -4267,7 +4258,7 @@ int perf_session__read_header(struct perf_session *session, int repipe_fd) * We can read 'pipe' data event from regular file, * check for the pipe header regardless of source. */ - err = perf_header__read_pipe(session, repipe_fd); + err = perf_header__read_pipe(session); if (!err || perf_data__is_pipe(data)) { data->is_pipe = true; return err; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 7137509cf6d8..a63a361f20f4 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -131,7 +131,7 @@ union perf_event; extern const char perf_version_string[]; -int perf_session__read_header(struct perf_session *session, int repipe_fd); +int perf_session__read_header(struct perf_session *session); int perf_session__write_header(struct perf_session *session, struct evlist *evlist, int fd, bool at_exit); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5a51db036ee8..b492300ec959 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -45,11 +45,11 @@ static int perf_session__deliver_event(struct perf_session *session, u64 file_offset, const char *file_path); -static int perf_session__open(struct perf_session *session, int repipe_fd) +static int perf_session__open(struct perf_session *session) { struct perf_data *data = session->data; - if (perf_session__read_header(session, repipe_fd) < 0) { + if (perf_session__read_header(session) < 0) { pr_err("incompatible file format (rerun with -v to learn more)\n"); return -1; } @@ -162,7 +162,7 @@ struct perf_session *__perf_session__new(struct perf_data *data, session->data = data; if (perf_data__is_read(data)) { - ret = perf_session__open(session, /*repipe_fd=*/-1); + ret = perf_session__open(session); if (ret < 0) goto out_delete; -- cgit v1.2.3 From ccb9004656e55ea13e11e93862aa32aa575bf802 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:54 -0700 Subject: perf test: Additional pipe tests with pipe output written to a file Additional pipe tests where piped files are written to disk. This means that spotting a file name of "-" isn't a sufficient "is pipe?" test. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/pipe_test.sh | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/pipe_test.sh b/tools/perf/tests/shell/pipe_test.sh index ad10012fdc29..a3c94b4182c2 100755 --- a/tools/perf/tests/shell/pipe_test.sh +++ b/tools/perf/tests/shell/pipe_test.sh @@ -11,6 +11,7 @@ sym="noploop" skip_test_missing_symbol ${sym} data=$(mktemp /tmp/perf.data.XXXXXX) +data2=$(mktemp /tmp/perf.data2.XXXXXX) prog="perf test -w noploop" err=0 @@ -19,6 +20,8 @@ set -e cleanup() { rm -rf "${data}" rm -rf "${data}".old + rm -rf "${data2}" + rm -rf "${data2}".old trap - EXIT TERM INT } @@ -49,6 +52,14 @@ test_record_report() { return fi + perf record -g -e task-clock:u -o - ${prog} > ${data} + if ! 
perf report -i ${data} --task | grep -q ${task} + then + echo "Record+report pipe test [Failed - cannot find the test file in the perf report #3]" + err=1 + return + fi + echo "Record+report pipe test [Success]" } @@ -86,6 +97,21 @@ test_inject_bids() { return fi + perf record -e task-clock:u -o - ${prog} > ${data} + if ! perf inject ${inject_opt} -i ${data} | perf report -i - | grep -q ${sym}; then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #5]" + err=1 + return + fi + + perf record -e task-clock:u -o - ${prog} > ${data} + perf inject ${inject_opt} -i ${data} -o ${data2} + if ! perf report -i ${data2} | grep -q ${sym}; then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #6]" + err=1 + return + fi + echo "Inject ${inject_opt} build-ids test [Success]" } -- cgit v1.2.3 From 10d6c57c824e21b4cf8eda9491caea06edac9fd7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 29 Aug 2024 23:51:48 -0700 Subject: perf lock contention: Handle error in a single place It has some duplicate codes to do the same job. Let's add a label and goto there to handle errors in a single place. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240830065150.1758962-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index d931a898c434..e8a6f6463019 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -439,11 +439,8 @@ int contention_end(u64 *ctx) duration = bpf_ktime_get_ns() - pelem->timestamp; if ((__s64)duration < 0) { - pelem->lock = 0; - if (need_delete) - bpf_map_delete_elem(&tstamp, &pid); __sync_fetch_and_add(&time_fail, 1); - return 0; + goto out; } switch (aggr_mode) { @@ -477,11 +474,8 @@ int contention_end(u64 *ctx) data = bpf_map_lookup_elem(&lock_stat, &key); if (!data) { if (data_map_full) { - pelem->lock = 0; - if (need_delete) - bpf_map_delete_elem(&tstamp, &pid); __sync_fetch_and_add(&data_fail, 1); - return 0; + goto out; } struct contention_data first = { @@ -502,10 +496,7 @@ int contention_end(u64 *ctx) data_map_full = 1; __sync_fetch_and_add(&data_fail, 1); } - pelem->lock = 0; - if (need_delete) - bpf_map_delete_elem(&tstamp, &pid); - return 0; + goto out; } __sync_fetch_and_add(&data->total_time, duration); @@ -517,6 +508,7 @@ int contention_end(u64 *ctx) if (data->min_time > duration) data->min_time = duration; +out: pelem->lock = 0; if (need_delete) bpf_map_delete_elem(&tstamp, &pid); -- cgit v1.2.3 From 05a5dd1dfd8fdeee6498875c2d78d4385d627bd6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 29 Aug 2024 23:51:49 -0700 Subject: perf lock contention: Simplify spinlock check The LCB_F_SPIN bit is used for spinlock, rwlock and optimistic spinning in mutex. In get_tstamp_elem() it needs to check spinlock and rwlock only. As mutex sets the LCB_F_MUTEX, it can check those two bits and reduce the number of operations. 
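To see why the single mask test covers the spinlock and rwlock cases, consider a small standalone sketch. The flag values below are assumptions chosen to mirror perf's lock contention flags for illustration; only the mask expression itself is taken from this patch:

	#include <assert.h>

	/* Assumed flag values, for illustration only. */
	#define LCB_F_SPIN	(1U << 0)
	#define LCB_F_READ	(1U << 1)
	#define LCB_F_WRITE	(1U << 2)
	#define LCB_F_MUTEX	(1U << 5)

	/* SPIN set and MUTEX clear: spinlock and rwlock, not mutex spinning. */
	static int uses_percpu_array(unsigned int flags)
	{
		return (flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN;
	}

	int main(void)
	{
		assert(uses_percpu_array(LCB_F_SPIN));			/* spinlock */
		assert(uses_percpu_array(LCB_F_SPIN | LCB_F_READ));	/* rwlock, read */
		assert(uses_percpu_array(LCB_F_SPIN | LCB_F_WRITE));	/* rwlock, write */
		assert(!uses_percpu_array(LCB_F_SPIN | LCB_F_MUTEX));	/* mutex spinning */
		return 0;
	}

The three exact values the old code compared against all satisfy the mask expression, while mutex optimistic spinning, which sets LCB_F_MUTEX alongside LCB_F_SPIN, does not, so one masked comparison replaces three equality checks.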
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240830065150.1758962-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index e8a6f6463019..4b7237e178bd 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -323,8 +323,7 @@ static inline struct tstamp_data *get_tstamp_elem(__u32 flags) struct tstamp_data *pelem; /* Use per-cpu array map for spinlock and rwlock */ - if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN || - flags == (LCB_F_SPIN | LCB_F_WRITE)) { + if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) { __u32 idx = 0; pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx); -- cgit v1.2.3 From 36cddd105666d45a2cfb70a04c27b0c53bb383cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 29 Aug 2024 23:51:50 -0700 Subject: perf lock contention: Do not fail EEXIST for update When it updates the lock stat for the first time, it needs to create an element in the BPF hash map. But if there's a concurrent thread waiting for the same lock (like for rwsem or rwlock), it might race with the thread and possibly fail to update with -EEXIST. In that case, it can lookup the map again and put the data there instead of failing. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240830065150.1758962-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index 4b7237e178bd..52a876b42699 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -491,6 +491,12 @@ int contention_end(u64 *ctx) err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST); if (err < 0) { + if (err == -EEXIST) { + /* it lost the race, try to get it again */ + data = bpf_map_lookup_elem(&lock_stat, &key); + if (data != NULL) + goto found; + } if (err == -E2BIG) data_map_full = 1; __sync_fetch_and_add(&data_fail, 1); @@ -498,6 +504,7 @@ int contention_end(u64 *ctx) goto out; } +found: __sync_fetch_and_add(&data->total_time, duration); __sync_fetch_and_add(&data->count, 1); -- cgit v1.2.3 From 74fd69a35cae1d53d9c95b700aae312d77b0d1cb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 27 Aug 2024 22:29:53 -0700 Subject: perf lock contention: Fix spinlock and rwlock accounting The spinlock and rwlock use a single-element per-cpu array to track current locks due to performance reason. But this means the key is always available and it cannot simply account lock stats in the array because some of them are invalid. In fact, the contention_end() program in the BPF invalidates the entry by setting the 'lock' value to 0 instead of deleting the entry for the hashmap. So it should skip entries with the lock value of 0 in the account_end_timestamp(). 
Otherwise, it'd have spurious high contention on an idle machine: $ sudo perf lock con -ab -Y spinlock sleep 3 contended total wait max wait avg wait type caller 8 4.72 s 1.84 s 590.46 ms spinlock rcu_core+0xc7 8 1.87 s 1.87 s 233.48 ms spinlock process_one_work+0x1b5 2 1.87 s 1.87 s 933.92 ms spinlock worker_thread+0x1a2 3 1.81 s 1.81 s 603.93 ms spinlock tmigr_update_events+0x13c 2 1.72 s 1.72 s 861.98 ms spinlock tick_do_update_jiffies64+0x25 6 42.48 us 13.02 us 7.08 us spinlock futex_q_lock+0x2a 1 13.03 us 13.03 us 13.03 us spinlock futex_wake+0xce 1 11.61 us 11.61 us 11.61 us spinlock rcu_core+0xc7 I don't believe it has contention on a spinlock longer than 1 second. After this change, it only reports some small contentions. $ sudo perf lock con -ab -Y spinlock sleep 3 contended total wait max wait avg wait type caller 4 133.51 us 43.29 us 33.38 us spinlock tick_do_update_jiffies64+0x25 4 69.06 us 31.82 us 17.27 us spinlock process_one_work+0x1b5 2 50.66 us 25.77 us 25.33 us spinlock rcu_core+0xc7 1 28.45 us 28.45 us 28.45 us spinlock rcu_core+0xc7 1 24.77 us 24.77 us 24.77 us spinlock tmigr_update_events+0x13c 1 23.34 us 23.34 us 23.34 us spinlock raw_spin_rq_lock_nested+0x15 Fixes: b5711042a1c8cc88 ("perf lock contention: Use per-cpu array map for spinlocks") Reported-by: Xi Wang Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: bpf@vger.kernel.org Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240828052953.1445862-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_lock_contention.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index b4cb3fe5cc25..bc4e92c0c08b 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -286,6 +286,9 @@ static void account_end_timestamp(struct lock_contention *con) goto next; for (int i = 0; i < total_cpus; i++) { + if (cpu_data[i].lock == 0) + continue; + update_lock_stat(stat_fd, -1, end_ts, aggr_mode, &cpu_data[i]); } -- cgit v1.2.3 From 39c243411bdb8fb35777adf49ee32549633c4e12 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 19 Aug 2024 10:47:20 +0800 Subject: perf sched timehist: Fixed timestamp error when unable to confirm event sched_in time If sched_in event for current task is not recorded, sched_in timestamp will be set to end_time of time window interest, causing an error in timestamp show. In this case, we choose to ignore this event. Test scenario: perf[1229608] does not record the first sched_in event, run time and sch delay are both 0 # perf sched timehist Samples of sched_switch event do not have callchains. 
time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 2090450.763231 [0000] perf[1229608] 0.000 0.000 0.000 2090450.763235 [0000] migration/0[15] 0.000 0.001 0.003 2090450.763263 [0001] perf[1229608] 0.000 0.000 0.000 2090450.763268 [0001] migration/1[21] 0.000 0.001 0.004 2090450.763302 [0002] perf[1229608] 0.000 0.000 0.000 2090450.763309 [0002] migration/2[27] 0.000 0.001 0.007 2090450.763338 [0003] perf[1229608] 0.000 0.000 0.000 2090450.763343 [0003] migration/3[33] 0.000 0.001 0.004 Before: arbitrarily specify a time window of interest, timestamp will be set to an incorrect value # perf sched timehist --time 100,200 Samples of sched_switch event do not have callchains. time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 200.000000 [0000] perf[1229608] 0.000 0.000 0.000 200.000000 [0001] perf[1229608] 0.000 0.000 0.000 200.000000 [0002] perf[1229608] 0.000 0.000 0.000 200.000000 [0003] perf[1229608] 0.000 0.000 0.000 200.000000 [0004] perf[1229608] 0.000 0.000 0.000 200.000000 [0005] perf[1229608] 0.000 0.000 0.000 200.000000 [0006] perf[1229608] 0.000 0.000 0.000 200.000000 [0007] perf[1229608] 0.000 0.000 0.000 After: # perf sched timehist --time 100,200 Samples of sched_switch event do not have callchains. time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- Fixes: 853b74071110bed3 ("perf sched timehist: Add option to specify time window of interest") Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: David Ahern Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819024720.2405244-1-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 9c1276dc8ef6..1c386ebe4b98 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2683,9 +2683,12 @@ static int timehist_sched_change_event(const struct perf_tool *tool, * - previous sched event is out of window - we are done * - sample time is beyond window user cares about - reset it * to close out stats for time window interest + * - If tprev is 0, that is, sched_in event for current task is + * not recorded, cannot determine whether sched_in event is + * within time window interest - ignore it */ if (ptime->end) { - if (tprev > ptime->end) + if (!tprev || tprev > ptime->end) goto out; if (t > ptime->end) -- cgit v1.2.3 From 01b52f01c5a6bdc3b3e4229dccc84ed667e6867b Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jul 2024 18:02:11 +0200 Subject: selftests: vDSO: simplify getrandom thread local storage and structs Rather than using pthread_get/set_specific, just use gcc's __thread annotation, which is noticeably faster and makes the code more obvious. Also, just have one simplified struct called vgrnd, instead of trying to split things up semantically. Those divisions were useful when this code was split across several commit *messages*, but doesn't make as much sense within a single file. 
This should make the code more clear and provide a better example for implementers. Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 67 +++++++++------------- 1 file changed, 27 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 05122425a873..89c961175956 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -38,50 +38,43 @@ static struct { pthread_mutex_t lock; void **states; size_t len, cap; -} grnd_allocator = { - .lock = PTHREAD_MUTEX_INITIALIZER -}; - -static struct { ssize_t(*fn)(void *, size_t, unsigned long, void *, size_t); - pthread_key_t key; - pthread_once_t initialized; struct vgetrandom_opaque_params params; -} grnd_ctx = { - .initialized = PTHREAD_ONCE_INIT +} vgrnd = { + .lock = PTHREAD_MUTEX_INITIALIZER }; static void *vgetrandom_get_state(void) { void *state = NULL; - pthread_mutex_lock(&grnd_allocator.lock); - if (!grnd_allocator.len) { + pthread_mutex_lock(&vgrnd.lock); + if (!vgrnd.len) { size_t page_size = getpagesize(); size_t new_cap; size_t alloc_size, num = sysconf(_SC_NPROCESSORS_ONLN); /* Just a decent heuristic. */ void *new_block, *new_states; - alloc_size = (num * grnd_ctx.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1)); - num = (page_size / grnd_ctx.params.size_of_opaque_state) * (alloc_size / page_size); - new_block = mmap(0, alloc_size, grnd_ctx.params.mmap_prot, grnd_ctx.params.mmap_flags, -1, 0); + alloc_size = (num * vgrnd.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1)); + num = (page_size / vgrnd.params.size_of_opaque_state) * (alloc_size / page_size); + new_block = mmap(0, alloc_size, vgrnd.params.mmap_prot, vgrnd.params.mmap_flags, -1, 0); if (new_block == MAP_FAILED) goto out; - new_cap = grnd_allocator.cap + num; - new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states)); + new_cap = vgrnd.cap + num; + new_states = reallocarray(vgrnd.states, new_cap, sizeof(*vgrnd.states)); if (!new_states) goto unmap; - grnd_allocator.cap = new_cap; - grnd_allocator.states = new_states; + vgrnd.cap = new_cap; + vgrnd.states = new_states; for (size_t i = 0; i < num; ++i) { - if (((uintptr_t)new_block & (page_size - 1)) + grnd_ctx.params.size_of_opaque_state > page_size) + if (((uintptr_t)new_block & (page_size - 1)) + vgrnd.params.size_of_opaque_state > page_size) new_block = (void *)(((uintptr_t)new_block + page_size - 1) & (~(page_size - 1))); - grnd_allocator.states[i] = new_block; - new_block += grnd_ctx.params.size_of_opaque_state; + vgrnd.states[i] = new_block; + new_block += vgrnd.params.size_of_opaque_state; } - grnd_allocator.len = num; + vgrnd.len = num; goto success; unmap: @@ -89,10 +82,10 @@ static void *vgetrandom_get_state(void) goto out; } success: - state = grnd_allocator.states[--grnd_allocator.len]; + state = vgrnd.states[--vgrnd.len]; out: - pthread_mutex_unlock(&grnd_allocator.lock); + pthread_mutex_unlock(&vgrnd.lock); return state; } @@ -100,27 +93,25 @@ static void vgetrandom_put_state(void *state) { if (!state) return; - pthread_mutex_lock(&grnd_allocator.lock); - grnd_allocator.states[grnd_allocator.len++] = state; - pthread_mutex_unlock(&grnd_allocator.lock); + pthread_mutex_lock(&vgrnd.lock); + vgrnd.states[vgrnd.len++] = state; + pthread_mutex_unlock(&vgrnd.lock); } static void vgetrandom_init(void) { - if 
(pthread_key_create(&grnd_ctx.key, vgetrandom_put_state) != 0) - return; unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); exit(KSFT_SKIP); } vdso_init_from_sysinfo_ehdr(sysinfo_ehdr); - grnd_ctx.fn = (__typeof__(grnd_ctx.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); - if (!grnd_ctx.fn) { + vgrnd.fn = (__typeof__(vgrnd.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); + if (!vgrnd.fn) { printf("__vdso_getrandom is missing!\n"); exit(KSFT_FAIL); } - if (grnd_ctx.fn(NULL, 0, 0, &grnd_ctx.params, ~0UL) != 0) { + if (vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL) != 0) { printf("failed to fetch vgetrandom params!\n"); exit(KSFT_FAIL); } @@ -128,22 +119,16 @@ static void vgetrandom_init(void) static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) { - void *state; + static __thread void *state; - pthread_once(&grnd_ctx.initialized, vgetrandom_init); - state = pthread_getspecific(grnd_ctx.key); if (!state) { state = vgetrandom_get_state(); - if (pthread_setspecific(grnd_ctx.key, state) != 0) { - vgetrandom_put_state(state); - state = NULL; - } if (!state) { printf("vgetrandom_get_state failed!\n"); exit(KSFT_FAIL); } } - return grnd_ctx.fn(buf, len, flags, state, grnd_ctx.params.size_of_opaque_state); + return vgrnd.fn(buf, len, flags, state, vgrnd.params.size_of_opaque_state); } enum { TRIALS = 25000000, THREADS = 256 }; @@ -265,6 +250,8 @@ static void usage(const char *argv0) int main(int argc, char *argv[]) { + vgetrandom_init(); + if (argc == 1) { kselftest(); return 0; -- cgit v1.2.3 From 20a9af057cd7c4057c72c803dcf83c97eb5deb95 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2024 09:13:21 +0200 Subject: selftests: vDSO: don't hard-code location of vDSO sources Architectures use different location for vDSO sources: arch/mips/vdso arch/sparc/vdso arch/arm64/kernel/vdso arch/riscv/kernel/vdso arch/csky/kernel/vdso arch/x86/um/vdso arch/x86/entry/vdso arch/powerpc/kernel/vdso arch/arm/vdso arch/loongarch/vdso Don't hard-code vdso sources location in selftest Makefile. Instead create a vdso/ symbolic link in tools/arch/$arch/ and update Makefile accordingly. Signed-off-by: Christophe Leroy Signed-off-by: Jason A. 
Donenfeld --- tools/arch/x86/vdso | 1 + tools/testing/selftests/vDSO/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 tools/arch/x86/vdso (limited to 'tools') diff --git a/tools/arch/x86/vdso b/tools/arch/x86/vdso new file mode 120000 index 000000000000..7eb962fd3454 --- /dev/null +++ b/tools/arch/x86/vdso @@ -0,0 +1 @@ +../../../arch/x86/entry/vdso/ \ No newline at end of file diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 3de8e7e052ae..c9a819cacbf2 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -40,7 +40,7 @@ $(OUTPUT)/vdso_test_getrandom: parse_vdso.c $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ -isystem $(top_srcdir)/include/uapi -$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(ARCH)/vdso/vgetrandom-chacha.S $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ -isystem $(top_srcdir)/arch/$(ARCH)/include \ -isystem $(top_srcdir)/include \ -- cgit v1.2.3 From f8d92fc527ff0388b2e94c101afb0e5c7496199f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2024 09:13:23 +0200 Subject: selftests: vDSO: fix include order in build of test_vdso_chacha Building test_vdso_chacha currently leads to following issue: In file included from /home/chleroy/linux-powerpc/include/linux/limits.h:7, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/local_lim.h:38, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/posix1_lim.h:161, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/limits.h:195, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/lib/gcc/powerpc64-buildroot-linux-gnu/12.3.0/include-fixed/limits.h:203, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/lib/gcc/powerpc64-buildroot-linux-gnu/12.3.0/include-fixed/syslimits.h:7, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/lib/gcc/powerpc64-buildroot-linux-gnu/12.3.0/include-fixed/limits.h:34, from /tmp/sodium/usr/local/include/sodium/export.h:7, from /tmp/sodium/usr/local/include/sodium/crypto_stream_chacha20.h:14, from vdso_test_chacha.c:6: /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/xopen_lim.h:99:6: error: missing binary operator before token "(" 99 | # if INT_MAX == 32767 | ^~~~~~~ /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/xopen_lim.h:102:7: error: missing binary operator before token "(" 102 | # if INT_MAX == 2147483647 | ^~~~~~~ /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/xopen_lim.h:126:6: error: missing binary operator before token "(" 126 | # if LONG_MAX == 2147483647 | ^~~~~~~~ This is due to kernel include/linux/limits.h being included instead of libc's limits.h. This is because directory include/ is added through option -isystem so it goes prior to glibc's include directory. Replace -isystem by -idirafter. But this implies that now tools/include/linux/linkage.h is included instead of include/linux/linkage.h, so define a stub for SYM_FUNC_START() and SYM_FUNC_END(). Signed-off-by: Christophe Leroy Signed-off-by: Jason A. 
Donenfeld --- tools/include/linux/linkage.h | 4 ++++ tools/testing/selftests/vDSO/Makefile | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/linkage.h b/tools/include/linux/linkage.h index bc763d500262..a48ff086899c 100644 --- a/tools/include/linux/linkage.h +++ b/tools/include/linux/linkage.h @@ -1,4 +1,8 @@ #ifndef _TOOLS_INCLUDE_LINUX_LINKAGE_H #define _TOOLS_INCLUDE_LINUX_LINKAGE_H +#define SYM_FUNC_START(x) .globl x; x: + +#define SYM_FUNC_END(x) + #endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */ diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index c9a819cacbf2..10ffdda3f2fa 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -42,7 +42,7 @@ $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ $(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(ARCH)/vdso/vgetrandom-chacha.S $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ - -isystem $(top_srcdir)/arch/$(ARCH)/include \ - -isystem $(top_srcdir)/include \ + -idirafter $(top_srcdir)/arch/$(ARCH)/include \ + -idirafter $(top_srcdir)/include \ -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ -Wa,--noexecstack $(SODIUM) -- cgit v1.2.3 From e1bbcab496f745d963e43a6e0f669359e82c4934 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2024 09:13:22 +0200 Subject: selftests: vDSO: look for arch-specific function name in getrandom test Don't hard-code x86 specific names. Rather, use vdso_config definitions to find the correct function matching the architecture. Add random VDSO function names in names[][]. Remove the #ifdef CONFIG_VDSO32, as having the name there all the time is harmless and guaranties a steady index for following strings. Signed-off-by: Christophe Leroy [Jason: add [6] to variable declaration rather than each usage site.] Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/vdso_config.h | 8 +++----- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 8 ++++++-- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 7b543e7f04d7..b5ba5ded3a61 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -68,16 +68,15 @@ static const char *versions[7] = { "LINUX_5.10" }; -static const char *names[2][6] = { +static const char *names[2][7] = { { "__kernel_gettimeofday", "__kernel_clock_gettime", "__kernel_time", "__kernel_clock_getres", "__kernel_getcpu", -#if defined(VDSO_32BIT) "__kernel_clock_gettime64", -#endif + "__kernel_getrandom", }, { "__vdso_gettimeofday", @@ -85,9 +84,8 @@ static const char *names[2][6] = { "__vdso_time", "__vdso_clock_getres", "__vdso_getcpu", -#if defined(VDSO_32BIT) "__vdso_clock_gettime64", -#endif + "__vdso_getrandom", }, }; diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 89c961175956..20bbef992c48 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -21,6 +21,7 @@ #include "../kselftest.h" #include "parse_vdso.h" +#include "vdso_config.h" #ifndef timespecsub #define timespecsub(tsp, usp, vsp) \ @@ -100,15 +101,18 @@ static void vgetrandom_put_state(void *state) static void vgetrandom_init(void) { + const char *version = versions[VDSO_VERSION]; + const char *name = names[VDSO_NAMES][6]; unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); exit(KSFT_SKIP); } vdso_init_from_sysinfo_ehdr(sysinfo_ehdr); - vgrnd.fn = (__typeof__(vgrnd.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); + vgrnd.fn = (__typeof__(vgrnd.fn))vdso_sym(version, name); if (!vgrnd.fn) { - printf("__vdso_getrandom is missing!\n"); + printf("%s is missing!\n", name); exit(KSFT_FAIL); } if (vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL) != 0) { -- cgit v1.2.3 From 1e661b349041c3d5026d9e95930b1d396c0c6e6d Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Tue, 27 Aug 2024 21:20:15 +0800 Subject: selftests: vDSO: add --cflags for pkg-config command querying libsodium When libsodium is installed into its own prefix, the --cflags output is needed for the compiler to find libsodium headers. Signed-off-by: Xi Ruoyao Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 10ffdda3f2fa..180854eb9fec 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null) +SODIUM := $(shell pkg-config --libs --cflags libsodium 2>/dev/null) TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu -- cgit v1.2.3 From a5330eb3bcd87f3ce7738a2e898671a0ff561e85 Mon Sep 17 00:00:00 2001 From: "Jason A. 
Donenfeld" Date: Tue, 27 Aug 2024 16:27:13 +0200 Subject: selftests: vDSO: separate LDLIBS from CFLAGS for libsodium On systems that set -Wl,--as-needed, putting the -lsodium in the wrong place on the command line means we get a linker error: CC vdso_test_chacha /usr/bin/ld: /tmp/ccKpjnSM.o: in function `main': vdso_test_chacha.c:(.text+0x276): undefined reference to `crypto_stream_chacha20' collect2: error: ld returned 1 exit status Fix this by passing pkg-config's --libs output to the LDFLAGS field instead of the CFLAGS field, as is customary. Reported-by: Adhemerval Zanella Netto Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 180854eb9fec..834aa862ba2c 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -SODIUM := $(shell pkg-config --libs --cflags libsodium 2>/dev/null) +SODIUM_LIBS := $(shell pkg-config --libs libsodium 2>/dev/null) +SODIUM_CFLAGS := $(shell pkg-config --cflags libsodium 2>/dev/null) TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu @@ -13,7 +14,7 @@ endif TEST_GEN_PROGS += vdso_test_correctness ifeq ($(uname_M),x86_64) TEST_GEN_PROGS += vdso_test_getrandom -ifneq ($(SODIUM),) +ifneq ($(SODIUM_LIBS),) TEST_GEN_PROGS += vdso_test_chacha endif endif @@ -41,8 +42,9 @@ $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ -isystem $(top_srcdir)/include/uapi $(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(ARCH)/vdso/vgetrandom-chacha.S +$(OUTPUT)/vdso_test_chacha: LDLIBS += $(SODIUM_LIBS) $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ -idirafter $(top_srcdir)/arch/$(ARCH)/include \ -idirafter $(top_srcdir)/include \ -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ - -Wa,--noexecstack $(SODIUM) + -Wa,--noexecstack $(SODIUM_CFLAGS) -- cgit v1.2.3 From be9155154bc39e910ba8e56f32be299cf83e21d0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 27 Aug 2024 16:52:36 +0200 Subject: selftests: vDSO: remove unnecessary command line defs from chacha test CONFIG_FUNCTION_ALIGNMENT=0 is no longer necessary and BULID_VDSO wasn't spelled right while BUILD_VDSO isn't necessary, so just remove these. Reported-by: Adhemerval Zanella Netto Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 834aa862ba2c..d1452c7d6d4f 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -46,5 +46,4 @@ $(OUTPUT)/vdso_test_chacha: LDLIBS += $(SODIUM_LIBS) $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ -idirafter $(top_srcdir)/arch/$(ARCH)/include \ -idirafter $(top_srcdir)/include \ - -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ - -Wa,--noexecstack $(SODIUM_CFLAGS) + -D__ASSEMBLY__ -Wa,--noexecstack $(SODIUM_CFLAGS) -- cgit v1.2.3 From b90eeff1badd9c3d12123e2721ae08dfb689cda7 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Wed, 28 Aug 2024 13:29:57 +0200 Subject: selftests: vDSO: use KHDR_INCLUDES for UAPI headers for getrandom test Building test_vdso_getrandom currently leads to following issue: In file included from /home/xry111/git-repos/linux/tools/include/linux/compiler_types.h:36, from /home/xry111/git-repos/linux/include/uapi/linux/stddef.h:5, from /home/xry111/git-repos/linux/include/uapi/linux/posix_types.h:5, from /usr/include/asm/sigcontext.h:12, from /usr/include/bits/sigcontext.h:30, from /usr/include/signal.h:301, from vdso_test_getrandom.c:14: /home/xry111/git-repos/linux/tools/include/linux/compiler-gcc.h:3:2: error: #error "Please don't include directly, include instead." 3 | #error "Please don't include directly, include instead." | ^~~~~ It's because the compiler_types.h inclusion in include/uapi/linux/stddef.h is expected to be removed by the header_install.sh script, as compiler_types.h shouldn't be used from user space. Add KHDR_INCLUDES before the existing include/uapi inclusion so that usr/include takes precedence if it's populated. Signed-off-by: Xi Ruoyao Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index d1452c7d6d4f..32ea4c2a9a23 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -39,6 +39,7 @@ $(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl $(OUTPUT)/vdso_test_getrandom: parse_vdso.c $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ + $(KHDR_INCLUDES) \ -isystem $(top_srcdir)/include/uapi $(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(ARCH)/vdso/vgetrandom-chacha.S -- cgit v1.2.3 From f78280b1a3cedd9f68d5f596179675514a15bd1d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 29 Aug 2024 20:23:23 +0200 Subject: selftests: vDSO: skip getrandom test if architecture is unsupported If the getrandom test compiles for an arch, don't exit fatally if the actual cpu it's running on is unsupported. Suggested-by: Adhemerval Zanella Netto Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 20bbef992c48..5db8ac8999cd 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -104,6 +104,7 @@ static void vgetrandom_init(void) const char *version = versions[VDSO_VERSION]; const char *name = names[VDSO_NAMES][6]; unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + size_t ret; if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); @@ -115,7 +116,11 @@ static void vgetrandom_init(void) printf("%s is missing!\n", name); exit(KSFT_FAIL); } - if (vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL) != 0) { + ret = vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL); + if (ret == -ENOSYS) { + printf("unsupported architecture\n"); + exit(KSFT_SKIP); + } else if (ret) { printf("failed to fetch vgetrandom params!\n"); exit(KSFT_FAIL); } -- cgit v1.2.3 From 59eb856c3ed9b3552befd240c0c339f22eed3fa1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:35 +0200 Subject: selftests: vDSO: fix vDSO name for powerpc Following error occurs when running vdso_test_correctness on powerpc: ~ # ./vdso_test_correctness [WARN] failed to find vDSO [SKIP] No vDSO, so skipping clock_gettime() tests [SKIP] No vDSO, so skipping clock_gettime64() tests [RUN] Testing getcpu... [OK] CPU 0: syscall: cpu 0, node 0 On powerpc, vDSO is neither called linux-vdso.so.1 nor linux-gate.so.1 but linux-vdso32.so.1 or linux-vdso64.so.1. Also search those two names before giving up. Fixes: c7e5789b24d3 ("kselftest: Move test_vdso to the vDSO test suite") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/vdso_test_correctness.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_correctness.c b/tools/testing/selftests/vDSO/vdso_test_correctness.c index e691a3cf1491..cdb697ae8343 100644 --- a/tools/testing/selftests/vDSO/vdso_test_correctness.c +++ b/tools/testing/selftests/vDSO/vdso_test_correctness.c @@ -114,6 +114,12 @@ static void fill_function_pointers() if (!vdso) vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-vdso32.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-vdso64.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); if (!vdso) { printf("[WARN]\tfailed to find vDSO\n"); return; -- cgit v1.2.3 From 7d297c419b08eafa69ce27243ee9bbecab4fcaa4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:36 +0200 Subject: selftests: vDSO: fix vdso_config for powerpc Running vdso_test_correctness on powerpc64 gives the following warning: ~ # ./vdso_test_correctness Warning: failed to find clock_gettime64 in vDSO This is because vdso_test_correctness was built with VDSO_32BIT defined. __powerpc__ macro is defined on both powerpc32 and powerpc64 so __powerpc64__ needs to be checked first in vdso_config.h Fixes: 693f5ca08ca0 ("kselftest: Extend vDSO selftest") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/vdso_config.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index b5ba5ded3a61..740ce8c98d2e 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -18,13 +18,13 @@ #elif defined(__aarch64__) #define VDSO_VERSION 3 #define VDSO_NAMES 0 -#elif defined(__powerpc__) +#elif defined(__powerpc64__) #define VDSO_VERSION 1 #define VDSO_NAMES 0 -#define VDSO_32BIT 1 -#elif defined(__powerpc64__) +#elif defined(__powerpc__) #define VDSO_VERSION 1 #define VDSO_NAMES 0 +#define VDSO_32BIT 1 #elif defined (__s390__) #define VDSO_VERSION 2 #define VDSO_NAMES 0 -- cgit v1.2.3 From ba83b3239e657469709d15dcea5f9b65bf9dbf34 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:37 +0200 Subject: selftests: vDSO: fix vDSO symbols lookup for powerpc64 On powerpc64, following tests fail locating vDSO functions: ~ # ./vdso_test_abi TAP version 13 1..16 # [vDSO kselftest] VDSO_VERSION: LINUX_2.6.15 # Couldn't find __kernel_gettimeofday ok 1 # SKIP __kernel_gettimeofday # clock_id: CLOCK_REALTIME # Couldn't find __kernel_clock_gettime ok 2 # SKIP __kernel_clock_gettime CLOCK_REALTIME # Couldn't find __kernel_clock_getres ok 3 # SKIP __kernel_clock_getres CLOCK_REALTIME ... # Couldn't find __kernel_time ok 16 # SKIP __kernel_time # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:16 error:0 ~ # ./vdso_test_getrandom __kernel_getrandom is missing! ~ # ./vdso_test_gettimeofday Could not find __kernel_gettimeofday ~ # ./vdso_test_getcpu Could not find __kernel_getcpu On powerpc64, as shown below by readelf, vDSO functions symbols have type NOTYPE, so also accept that type when looking for symbols. $ powerpc64-linux-gnu-readelf -a arch/powerpc/kernel/vdso/vdso64.so.dbg ELF Header: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00 Class: ELF64 Data: 2's complement, big endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: DYN (Shared object file) Machine: PowerPC64 Version: 0x1 ... Symbol table '.dynsym' contains 12 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000524 84 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 2: 00000000000005f0 36 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 3: 0000000000000578 68 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 4: 0000000000000000 0 OBJECT GLOBAL DEFAULT ABS LINUX_2.6.15 5: 00000000000006c0 48 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 6: 0000000000000614 172 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 7: 00000000000006f0 84 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 8: 000000000000047c 84 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 9: 0000000000000454 12 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 10: 00000000000004d0 84 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 11: 00000000000005bc 52 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 Symbol table '.symtab' contains 56 entries: Num: Value Size Type Bind Vis Ndx Name ... 
45: 0000000000000000 0 OBJECT GLOBAL DEFAULT ABS LINUX_2.6.15 46: 00000000000006c0 48 NOTYPE GLOBAL DEFAULT 8 __kernel_getcpu 47: 0000000000000524 84 NOTYPE GLOBAL DEFAULT 8 __kernel_clock_getres 48: 00000000000005f0 36 NOTYPE GLOBAL DEFAULT 8 __kernel_get_tbfreq 49: 000000000000047c 84 NOTYPE GLOBAL DEFAULT 8 __kernel_gettimeofday 50: 0000000000000614 172 NOTYPE GLOBAL DEFAULT 8 __kernel_sync_dicache 51: 00000000000006f0 84 NOTYPE GLOBAL DEFAULT 8 __kernel_getrandom 52: 0000000000000454 12 NOTYPE GLOBAL DEFAULT 8 __kernel_sigtram[...] 53: 0000000000000578 68 NOTYPE GLOBAL DEFAULT 8 __kernel_time 54: 00000000000004d0 84 NOTYPE GLOBAL DEFAULT 8 __kernel_clock_g[...] 55: 00000000000005bc 52 NOTYPE GLOBAL DEFAULT 8 __kernel_get_sys[...] Fixes: 98eedc3a9dbf ("Document the vDSO and add a reference parser") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/parse_vdso.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 4ae417372e9e..d9ccc5acac18 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -216,7 +216,8 @@ void *vdso_sym(const char *version, const char *name) ELF(Sym) *sym = &vdso_info.symtab[chain]; /* Check for a defined global or weak function w/ right name. */ - if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC) + if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC && + ELF64_ST_TYPE(sym->st_info) != STT_NOTYPE) continue; if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && ELF64_ST_BIND(sym->st_info) != STB_WEAK) -- cgit v1.2.3 From 6eda706a535c3d0119eaefaad5fc119609639ed2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:38 +0200 Subject: selftests: vDSO: fix the way vDSO functions are called for powerpc vdso_test_correctness test fails on powerpc: ~ # ./vdso_test_correctness ... [RUN] Testing clock_gettime for clock CLOCK_REALTIME_ALARM (8)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 [RUN] Testing clock_gettime for clock CLOCK_BOOTTIME_ALARM (9)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 [RUN] Testing clock_gettime for clock CLOCK_SGI_CYCLE (10)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 ... [RUN] Testing clock_gettime for clock invalid (-1)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 [RUN] Testing clock_gettime for clock invalid (-2147483648)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 [RUN] Testing clock_gettime for clock invalid (2147483647)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 On powerpc, a call to a VDSO function is not an ordinary C function call. Unlike several architectures which returns a negative error code in case of an error, powerpc sets CR[SO] and returns the error code as a positive value. Define and use a macro called VDSO_CALL() which takes a pointer to the function to call, the number of arguments and the arguments. Also update ABI vdso documentation to reflect this subtlety. Provide a specific version of VDSO_CALL() for powerpc that negates the error code on return when CR[SO] is set. 
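To make the resulting convention concrete, here is a hypothetical call site, assuming the function pointer was resolved beforehand with vdso_sym() from parse_vdso.c; checked_clock_gettime() is an illustrative helper, not part of the patch:

	#include <time.h>
	#include <stdio.h>
	#include "vdso_call.h"

	typedef long (*vdso_clock_gettime_t)(clockid_t clk, struct timespec *ts);

	static long checked_clock_gettime(vdso_clock_gettime_t fn, clockid_t clk,
					  struct timespec *ts)
	{
		long ret = VDSO_CALL(fn, 2, clk, ts);

		/*
		 * On x86 the macro expands to a plain fn(clk, ts). On powerpc
		 * it goes through the assembly wrapper, which tests CR[SO] and
		 * negates the positive error code, so a bad clock id shows up
		 * here as a negative value on every architecture.
		 */
		if (ret < 0)
			fprintf(stderr, "vDSO clock_gettime failed: %ld\n", ret);
		return ret;
	}
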
Fixes: c7e5789b24d3 ("kselftest: Move test_vdso to the vDSO test suite") Fixes: 2e9a97256616 ("selftests: vdso: Add a selftest for vDSO getcpu()") Fixes: 693f5ca08ca0 ("kselftest: Extend vDSO selftest") Fixes: b2f1c3db2887 ("kselftest: Extend vdso correctness test to clock_gettime64") Fixes: 4920a2590e91 ("selftests/vDSO: add tests for vgetrandom") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. Donenfeld --- Documentation/ABI/stable/vdso | 8 ++- tools/testing/selftests/vDSO/vdso_call.h | 70 ++++++++++++++++++++++ tools/testing/selftests/vDSO/vdso_test_abi.c | 9 +-- .../testing/selftests/vDSO/vdso_test_correctness.c | 15 ++--- tools/testing/selftests/vDSO/vdso_test_getcpu.c | 3 +- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 5 +- .../selftests/vDSO/vdso_test_gettimeofday.c | 3 +- 7 files changed, 95 insertions(+), 18 deletions(-) create mode 100644 tools/testing/selftests/vDSO/vdso_call.h (limited to 'tools') diff --git a/Documentation/ABI/stable/vdso b/Documentation/ABI/stable/vdso index 951838d42781..85dbb6a160df 100644 --- a/Documentation/ABI/stable/vdso +++ b/Documentation/ABI/stable/vdso @@ -9,9 +9,11 @@ maps an ELF DSO into that program's address space. This DSO is called the vDSO and it often contains useful and highly-optimized alternatives to real syscalls. -These functions are called just like ordinary C function according to -your platform's ABI. Call them from a sensible context. (For example, -if you set CS on x86 to something strange, the vDSO functions are +These functions are called according to your platform's ABI. On many +platforms they are called just like ordinary C function. On other platforms +(ex: powerpc) they are called with the same convention as system calls which +is different from ordinary C functions. Call them from a sensible context. +(For example, if you set CS on x86 to something strange, the vDSO functions are within their rights to crash.) In addition, if you pass a bad pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT. diff --git a/tools/testing/selftests/vDSO/vdso_call.h b/tools/testing/selftests/vDSO/vdso_call.h new file mode 100644 index 000000000000..bb237d771051 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_call.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macro to call vDSO functions + * + * Copyright (C) 2024 Christophe Leroy , CS GROUP France + */ +#ifndef __VDSO_CALL_H__ +#define __VDSO_CALL_H__ + +#ifdef __powerpc__ + +#define LOADARGS_1(fn, __arg1) do { \ + _r0 = fn; \ + _r3 = (long)__arg1; \ +} while (0) + +#define LOADARGS_2(fn, __arg1, __arg2) do { \ + _r0 = fn; \ + _r3 = (long)__arg1; \ + _r4 = (long)__arg2; \ +} while (0) + +#define LOADARGS_3(fn, __arg1, __arg2, __arg3) do { \ + _r0 = fn; \ + _r3 = (long)__arg1; \ + _r4 = (long)__arg2; \ + _r5 = (long)__arg3; \ +} while (0) + +#define LOADARGS_5(fn, __arg1, __arg2, __arg3, __arg4, __arg5) do { \ + _r0 = fn; \ + _r3 = (long)__arg1; \ + _r4 = (long)__arg2; \ + _r5 = (long)__arg3; \ + _r6 = (long)__arg4; \ + _r7 = (long)__arg5; \ +} while (0) + +#define VDSO_CALL(fn, nr, args...) 
({ \ + register void *_r0 asm ("r0"); \ + register long _r3 asm ("r3"); \ + register long _r4 asm ("r4"); \ + register long _r5 asm ("r5"); \ + register long _r6 asm ("r6"); \ + register long _r7 asm ("r7"); \ + register long _r8 asm ("r8"); \ + register long _rval asm ("r3"); \ + \ + LOADARGS_##nr(fn, args); \ + \ + asm volatile( \ + " mtctr %0\n" \ + " bctrl\n" \ + " bns+ 1f\n" \ + " neg 3, 3\n" \ + "1:" \ + : "+r" (_r0), "=r" (_r3), "+r" (_r4), "+r" (_r5), \ + "+r" (_r6), "+r" (_r7), "+r" (_r8) \ + : "r" (_rval) \ + : "r9", "r10", "r11", "r12", "cr0", "cr1", "cr5", \ + "cr6", "cr7", "xer", "lr", "ctr", "memory" \ + ); \ + _rval; \ +}) + +#else +#define VDSO_CALL(fn, nr, args...) fn(args) +#endif + +#endif diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 96d32fd65b42..00034208c4c6 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -20,6 +20,7 @@ #include "../kselftest.h" #include "vdso_config.h" +#include "vdso_call.h" extern void *vdso_sym(const char *version, const char *name); extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); @@ -61,7 +62,7 @@ static void vdso_test_gettimeofday(void) } struct timeval tv; - long ret = vdso_gettimeofday(&tv, 0); + long ret = VDSO_CALL(vdso_gettimeofday, 2, &tv, 0); if (ret == 0) { ksft_print_msg("The time is %lld.%06lld\n", @@ -86,7 +87,7 @@ static void vdso_test_clock_gettime(clockid_t clk_id) } struct timespec ts; - long ret = vdso_clock_gettime(clk_id, &ts); + long ret = VDSO_CALL(vdso_clock_gettime, 2, clk_id, &ts); if (ret == 0) { ksft_print_msg("The time is %lld.%06lld\n", @@ -111,7 +112,7 @@ static void vdso_test_time(void) return; } - long ret = vdso_time(NULL); + long ret = VDSO_CALL(vdso_time, 1, NULL); if (ret > 0) { ksft_print_msg("The time in hours since January 1, 1970 is %lld\n", @@ -138,7 +139,7 @@ static void vdso_test_clock_getres(clockid_t clk_id) } struct timespec ts, sys_ts; - long ret = vdso_clock_getres(clk_id, &ts); + long ret = VDSO_CALL(vdso_clock_getres, 2, clk_id, &ts); if (ret == 0) { ksft_print_msg("The vdso resolution is %lld %lld\n", diff --git a/tools/testing/selftests/vDSO/vdso_test_correctness.c b/tools/testing/selftests/vDSO/vdso_test_correctness.c index cdb697ae8343..5fb97ad67eea 100644 --- a/tools/testing/selftests/vDSO/vdso_test_correctness.c +++ b/tools/testing/selftests/vDSO/vdso_test_correctness.c @@ -20,6 +20,7 @@ #include #include "vdso_config.h" +#include "vdso_call.h" #include "../kselftest.h" static const char **name; @@ -186,7 +187,7 @@ static void test_getcpu(void) ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); if (vdso_getcpu) - ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); + ret_vdso = VDSO_CALL(vdso_getcpu, 3, &cpu_vdso, &node_vdso, 0); if (vgetcpu) ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); @@ -269,7 +270,7 @@ static void test_one_clock_gettime(int clock, const char *name) if (sys_clock_gettime(clock, &start) < 0) { if (errno == EINVAL) { - vdso_ret = vdso_clock_gettime(clock, &vdso); + vdso_ret = VDSO_CALL(vdso_clock_gettime, 2, clock, &vdso); if (vdso_ret == -EINVAL) { printf("[OK]\tNo such clock.\n"); } else { @@ -282,7 +283,7 @@ static void test_one_clock_gettime(int clock, const char *name) return; } - vdso_ret = vdso_clock_gettime(clock, &vdso); + vdso_ret = VDSO_CALL(vdso_clock_gettime, 2, clock, &vdso); end_ret = sys_clock_gettime(clock, &end); if (vdso_ret != 0 || end_ret != 0) { @@ -331,7 +332,7 @@ static void test_one_clock_gettime64(int clock, const char 
*name) if (sys_clock_gettime64(clock, &start) < 0) { if (errno == EINVAL) { - vdso_ret = vdso_clock_gettime64(clock, &vdso); + vdso_ret = VDSO_CALL(vdso_clock_gettime64, 2, clock, &vdso); if (vdso_ret == -EINVAL) { printf("[OK]\tNo such clock.\n"); } else { @@ -344,7 +345,7 @@ static void test_one_clock_gettime64(int clock, const char *name) return; } - vdso_ret = vdso_clock_gettime64(clock, &vdso); + vdso_ret = VDSO_CALL(vdso_clock_gettime64, 2, clock, &vdso); end_ret = sys_clock_gettime64(clock, &end); if (vdso_ret != 0 || end_ret != 0) { @@ -401,7 +402,7 @@ static void test_gettimeofday(void) return; } - vdso_ret = vdso_gettimeofday(&vdso, &vdso_tz); + vdso_ret = VDSO_CALL(vdso_gettimeofday, 2, &vdso, &vdso_tz); end_ret = sys_gettimeofday(&end, NULL); if (vdso_ret != 0 || end_ret != 0) { @@ -431,7 +432,7 @@ static void test_gettimeofday(void) } /* And make sure that passing NULL for tz doesn't crash. */ - vdso_gettimeofday(&vdso, NULL); + VDSO_CALL(vdso_gettimeofday, 2, &vdso, NULL); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index b758f68c6c9c..cdeaed45fb26 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -14,6 +14,7 @@ #include "../kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" +#include "vdso_call.h" struct getcpu_cache; typedef long (*getcpu_t)(unsigned int *, unsigned int *, @@ -42,7 +43,7 @@ int main(int argc, char **argv) return KSFT_SKIP; } - ret = get_cpu(&cpu, &node, 0); + ret = VDSO_CALL(get_cpu, 3, &cpu, &node, 0); if (ret == 0) { printf("Running on CPU %u node %u\n", cpu, node); } else { diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 5db8ac8999cd..351daeb649c8 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -22,6 +22,7 @@ #include "../kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" +#include "vdso_call.h" #ifndef timespecsub #define timespecsub(tsp, usp, vsp) \ @@ -116,7 +117,7 @@ static void vgetrandom_init(void) printf("%s is missing!\n", name); exit(KSFT_FAIL); } - ret = vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL); + ret = VDSO_CALL(vgrnd.fn, 5, NULL, 0, 0, &vgrnd.params, ~0UL); if (ret == -ENOSYS) { printf("unsupported architecture\n"); exit(KSFT_SKIP); @@ -137,7 +138,7 @@ static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) exit(KSFT_FAIL); } } - return vgrnd.fn(buf, len, flags, state, vgrnd.params.size_of_opaque_state); + return VDSO_CALL(vgrnd.fn, 5, buf, len, flags, state, vgrnd.params.size_of_opaque_state); } enum { TRIALS = 25000000, THREADS = 256 }; diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index ee4f1ca56a71..e31b18ffae33 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -19,6 +19,7 @@ #include "../kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" +#include "vdso_call.h" int main(int argc, char **argv) { @@ -43,7 +44,7 @@ int main(int argc, char **argv) } struct timeval tv; - long ret = gtod(&tv, 0); + long ret = VDSO_CALL(gtod, 2, &tv, 0); if (ret == 0) { printf("The time is %lld.%06lld\n", -- cgit v1.2.3 From f0d0dbbc101a5ed2cd844eae0c2cc0546327ef89 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:39 
+0200 Subject: selftests: vDSO: use parse_vdso.h in vdso_test_abi Don't duplicate parse_vdso function prototypes, include the header instead. Fixes: 693f5ca08ca0 ("kselftest: Extend vDSO selftest") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/vdso_test_abi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 00034208c4c6..a54424e2336f 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -21,10 +21,7 @@ #include "../kselftest.h" #include "vdso_config.h" #include "vdso_call.h" - -extern void *vdso_sym(const char *version, const char *name); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void vdso_init_from_auxv(void *auxv); +#include "parse_vdso.h" static const char *version; static const char **name; -- cgit v1.2.3 From d736d4fc763090f9a02dc5556174de9768093f43 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 30 Aug 2024 10:59:11 +0530 Subject: kselftest/arm64: Fix build warnings for ptrace A "%s" is missing in ksft_exit_fail_msg(); instead, use the newly introduced ksft_exit_fail_perror(). Signed-off-by: Dev Jain Reviewed-by: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240830052911.4040970-1-dev.jain@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c index e4fa507cbdd0..b51d21f78cf9 100644 --- a/tools/testing/selftests/arm64/abi/ptrace.c +++ b/tools/testing/selftests/arm64/abi/ptrace.c @@ -163,10 +163,10 @@ static void test_hw_debug(pid_t child, int type, const char *type_name) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_perror("PTRACE_TRACEME"); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_perror("raise(SIGSTOP)"); return EXIT_SUCCESS; } -- cgit v1.2.3 From 33ffa2dd0de28ded1fa856b3fb9cbf2b873d7dd0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 30 Aug 2024 17:21:36 +0200 Subject: selftests: vDSO: quash clang omitted parameter warning in getrandom test When building with clang, there's this warning: vdso_test_getrandom.c:145:40: warning: omitting the parameter name in a function definition is a C23 extension [-Wc23-extensions] 145 | static void *test_vdso_getrandom(void *) | ^ vdso_test_getrandom.c:155:40: warning: omitting the parameter name in a function definition is a C23 extension [-Wc23-extensions] 155 | static void *test_libc_getrandom(void *) | ^ vdso_test_getrandom.c:165:43: warning: omitting the parameter name in a function definition is a C23 extension [-Wc23-extensions] 165 | static void *test_syscall_getrandom(void *) Add the named ctx parameter to quash it. Reported-by: Mark Brown Reviewed-by: Mark Brown Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 351daeb649c8..8866b65a4605 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -143,7 +143,7 @@ static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) enum { TRIALS = 25000000, THREADS = 256 }; -static void *test_vdso_getrandom(void *) +static void *test_vdso_getrandom(void *ctx) { for (size_t i = 0; i < TRIALS; ++i) { unsigned int val; @@ -153,7 +153,7 @@ static void *test_vdso_getrandom(void *) return NULL; } -static void *test_libc_getrandom(void *) +static void *test_libc_getrandom(void *ctx) { for (size_t i = 0; i < TRIALS; ++i) { unsigned int val; @@ -163,7 +163,7 @@ static void *test_libc_getrandom(void *) return NULL; } -static void *test_syscall_getrandom(void *) +static void *test_syscall_getrandom(void *ctx) { for (size_t i = 0; i < TRIALS; ++i) { unsigned int val; -- cgit v1.2.3 From da18bfa59d403040d8bcba1284285916fe9e4425 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 30 Aug 2024 02:51:50 -0700 Subject: libbpf: Ensure new BTF objects inherit input endianness New split BTF needs to preserve base's endianness. Similarly, when creating a distilled BTF, we need to preserve original endianness. Fix by updating libbpf's btf__distill_base() and btf_new_empty() to retain the byte order of any source BTF objects when creating new ones. Fixes: ba451366bf44 ("libbpf: Implement basic split BTF support") Fixes: 58e185a0dc35 ("libbpf: Add btf__distill_base() creating split BTF with distilled base BTF") Reported-by: Song Liu Reported-by: Eduard Zingerman Suggested-by: Eduard Zingerman Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/6358db36c5f68b07873a0a5be2d062b1af5ea5f8.camel@gmail.com/ Link: https://lore.kernel.org/bpf/20240830095150.278881-1-tony.ambardar@gmail.com --- tools/lib/bpf/btf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 32c00db3b91b..40aae244e35f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -996,6 +996,7 @@ static struct btf *btf_new_empty(struct btf *base_btf) btf->base_btf = base_btf; btf->start_id = btf__type_cnt(base_btf); btf->start_str_off = base_btf->hdr->str_len; + btf->swapped_endian = base_btf->swapped_endian; } /* +1 for empty string at offset 0 */ @@ -5394,6 +5395,9 @@ int btf__distill_base(const struct btf *src_btf, struct btf **new_base_btf, new_base = btf__new_empty(); if (!new_base) return libbpf_err(-ENOMEM); + + btf__set_endianness(new_base, btf__endianness(src_btf)); + dist.id_map = calloc(n, sizeof(*dist.id_map)); if (!dist.id_map) { err = -ENOMEM; -- cgit v1.2.3 From 181b0d1af5d5b2ba8996fa715382cbfbfef46b6e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 30 Aug 2024 10:34:06 -0700 Subject: selftests/bpf: Check if distilled base inherits source endianness Create a BTF with endianness different from host, make a distilled base/split BTF pair from it, dump as raw bytes, import again and verify that endianness is preserved. 
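Seen from the libbpf API, the invariant that the fix establishes and this test pins down can be sketched as follows; distill_check_endianness() is an illustrative helper, not code from either patch:

	#include <bpf/btf.h>

	static int distill_check_endianness(const struct btf *src_btf,
					    struct btf **new_base,
					    struct btf **new_split)
	{
		int err = btf__distill_base(src_btf, new_base, new_split);

		if (err)
			return err;
		/*
		 * Both outputs must report the byte order of src_btf, even
		 * when it differs from the host's; a mismatch here would be
		 * the pre-fix behavior.
		 */
		if (btf__endianness(*new_base) != btf__endianness(src_btf) ||
		    btf__endianness(*new_split) != btf__endianness(src_btf))
			return -1;
		return 0;
	}
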
Reviewed-by: Alan Maguire Tested-by: Alan Maguire Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240830173406.1581007-1-eddyz87@gmail.com --- .../testing/selftests/bpf/prog_tests/btf_distill.c | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_distill.c b/tools/testing/selftests/bpf/prog_tests/btf_distill.c index bfbe795823a2..ca84726d5ac1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_distill.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_distill.c @@ -535,6 +535,72 @@ cleanup: btf__free(vmlinux_btf); } +/* Split and new base BTFs should inherit endianness from source BTF. */ +static void test_distilled_endianness(void) +{ + struct btf *base = NULL, *split = NULL, *new_base = NULL, *new_split = NULL; + struct btf *new_base1 = NULL, *new_split1 = NULL; + enum btf_endianness inverse_endianness; + const void *raw_data; + __u32 size; + + base = btf__new_empty(); + if (!ASSERT_OK_PTR(base, "empty_main_btf")) + return; + inverse_endianness = btf__endianness(base) == BTF_LITTLE_ENDIAN ? BTF_BIG_ENDIAN + : BTF_LITTLE_ENDIAN; + btf__set_endianness(base, inverse_endianness); + btf__add_int(base, "int", 4, BTF_INT_SIGNED); /* [1] int */ + VALIDATE_RAW_BTF( + base, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED"); + split = btf__new_empty_split(base); + if (!ASSERT_OK_PTR(split, "empty_split_btf")) + goto cleanup; + btf__add_ptr(split, 1); + VALIDATE_RAW_BTF( + split, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + if (!ASSERT_EQ(0, btf__distill_base(split, &new_base, &new_split), + "distilled_base") || + !ASSERT_OK_PTR(new_base, "distilled_base") || + !ASSERT_OK_PTR(new_split, "distilled_split") || + !ASSERT_EQ(2, btf__type_cnt(new_base), "distilled_base_type_cnt")) + goto cleanup; + VALIDATE_RAW_BTF( + new_split, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + + raw_data = btf__raw_data(new_base, &size); + if (!ASSERT_OK_PTR(raw_data, "btf__raw_data #1")) + goto cleanup; + new_base1 = btf__new(raw_data, size); + if (!ASSERT_OK_PTR(new_base1, "new_base1 = btf__new()")) + goto cleanup; + raw_data = btf__raw_data(new_split, &size); + if (!ASSERT_OK_PTR(raw_data, "btf__raw_data #2")) + goto cleanup; + new_split1 = btf__new_split(raw_data, size, new_base1); + if (!ASSERT_OK_PTR(new_split1, "new_split1 = btf__new()")) + goto cleanup; + + ASSERT_EQ(btf__endianness(new_base1), inverse_endianness, "new_base1 endianness"); + ASSERT_EQ(btf__endianness(new_split1), inverse_endianness, "new_split1 endianness"); + VALIDATE_RAW_BTF( + new_split1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); +cleanup: + btf__free(new_split1); + btf__free(new_base1); + btf__free(new_split); + btf__free(new_base); + btf__free(split); + btf__free(base); +} + void test_btf_distill(void) { if (test__start_subtest("distilled_base")) @@ -549,4 +615,6 @@ void test_btf_distill(void) test_distilled_base_multi_err2(); if (test__start_subtest("distilled_base_vmlinux")) test_distilled_base_vmlinux(); + if (test__start_subtest("distilled_endianness")) + test_distilled_endianness(); } -- cgit v1.2.3 From 38960ac8f916d1e60d4ceb4735a93740c4308d9c Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 28 Aug 2024 17:46:14 +0000 Subject: selftests/bpf: Specify libbpf headers required for %.bpf.o progs Test %.bpf.o objects 
actually depend only on some libbpf headers. Define a list of required headers and use it as TRUNNER_BPF_OBJS dependency. bpf_*.h list was determined by: $ grep -rh 'include Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240828174608.377204-1-ihor.solodrai@pm.me Link: https://lore.kernel.org/bpf/CAEf4BzYQ-j2i_xjs94Nn=8+FVfkWt51mLZyiYKiz9oA4Z=pCeA@mail.gmail.com/ --- tools/testing/selftests/bpf/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c120617b64ad..53cc13b92ee2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -516,6 +516,12 @@ xdp_features.skel.h-deps := xdp_features.bpf.o LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) +HEADERS_FOR_BPF_OBJS := $(wildcard $(BPFDIR)/*.bpf.h) \ + $(addprefix $(BPFDIR)/, bpf_core_read.h \ + bpf_endian.h \ + bpf_helpers.h \ + bpf_tracing.h) + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: @@ -566,8 +572,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ - $(wildcard $(BPFDIR)/bpf_*.h) \ - $(wildcard $(BPFDIR)/*.bpf.h) \ + $(HEADERS_FOR_BPF_OBJS) \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ -- cgit v1.2.3 From 2ad6d23f465a4f851e3bcf6d74c315ce7b2c205b Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 28 Aug 2024 17:46:23 +0000 Subject: selftests/bpf: Do not update vmlinux.h unnecessarily %.bpf.o objects depend on vmlinux.h, which makes them transitively dependent on unnecessary libbpf headers. However, vmlinux.h doesn't actually change that often. When generating vmlinux.h, compare it to the previous version and update it only if there are changes. Example of build time improvement (after first clean build): $ touch ../../../lib/bpf/bpf.h $ time make -j8 Before: real 1m37.592s After: real 0m27.310s Notice that the %.bpf.o gen step is skipped if vmlinux.h hasn't changed. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/CAEf4BzY1z5cC7BKye8=A8aTVxpsCzD=p1jdTfKC7i0XVuYoHUQ@mail.gmail.com Link: https://lore.kernel.org/bpf/20240828174608.377204-2-ihor.solodrai@pm.me --- tools/testing/selftests/bpf/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 53cc13b92ee2..7660d19b66c2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -399,10 +399,14 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif +# vmlinux.h is first dumped to a temporary file and then compared to +# the previous version.
This helps to avoid unnecessary re-builds of +# $(TRUNNER_BPF_OBJS) $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) $(call msg,GEN,,$@) - $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $(INCLUDE_DIR)/.vmlinux.h.tmp + $(Q)cmp -s $(INCLUDE_DIR)/.vmlinux.h.tmp $@ || mv $(INCLUDE_DIR)/.vmlinux.h.tmp $@ else $(call msg,CP,,$@) $(Q)cp "$(VMLINUX_H)" $@ -- cgit v1.2.3 From 43a17fcfcc4294f53ece15425ba362f5e28664c2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Aug 2024 15:55:36 -0400 Subject: selftest/cgroup: Make test_cpuset_prs.sh deal with pre-isolated CPUs Since isolated CPUs can be reserved at boot time via the "isolcpus" boot command line option, these pre-isolated CPUs may interfere with testing done by test_cpuset_prs.sh. With the previous commit that incorporates those boot-time isolated CPUs into "cpuset.cpus.isolated", we can check for those before testing is started to make sure that there will be no interference. Otherwise, this test will be skipped, since incorrect test failures could happen. As "cpuset.cpus.isolated" is now available in a non-cgroup_debug kernel, we don't need to check for its existence anymore. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 44 +++++++++++++++++------ 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7295424502b9..03c1bdaed2c3 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -84,6 +84,20 @@ echo member > test/cpuset.cpus.partition echo "" > test/cpuset.cpus [[ $RESULT -eq 0 ]] && skip_test "Child cgroups are using cpuset!" +# +# If isolated CPUs have been reserved at boot time (as shown in +# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-7 +# that will be used by this script for testing purposes. If not, some of +# the tests may fail incorrectly. These isolated CPUs will also be removed +# before being compared with the expected results. +# +BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated) +if [[ -n "$BOOT_ISOLCPUS" ]] +then + [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 7 ]] && + skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested" + echo "Pre-isolated CPUs: $BOOT_ISOLCPUS" +fi cleanup() { online_cpus @@ -642,7 +656,8 @@ check_cgroup_states() # Note that isolated CPUs from the sched/domains context include offline # CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may # not be included in the cpuset.cpus.isolated control file which contains -# only CPUs in isolated partitions. +# only CPUs in isolated partitions as well as those that are isolated at +# boot time.
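+# (Illustrative values: with "isolcpus=8" on the boot command line,
+# BOOT_ISOLCPUS is "8"; if the test then isolates CPU 3,
+# cpuset.cpus.isolated may read "3,8", and sed -e "s/,*$BOOT_ISOLCPUS//"
+# strips the pre-isolated CPU, leaving "3" to compare with the expected
+# value.)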
# # $1 - expected isolated cpu list(s) {,} # - expected sched/domains value @@ -669,18 +684,21 @@ check_isolcpus() fi # - # Check the debug isolated cpumask, if present + # Check cpuset.cpus.isolated cpumask # - [[ -f $ISCPUS ]] && { + if [[ -z "$BOOT_ISOLCPUS" ]] + then + ISOLCPUS=$(cat $ISCPUS) + else + ISOLCPUS=$(cat $ISCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") + fi + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { + # Take a 50ms pause and try again + pause 0.05 ISOLCPUS=$(cat $ISCPUS) - [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { - # Take a 50ms pause and try again - pause 0.05 - ISOLCPUS=$(cat $ISCPUS) - } - [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 - ISOLCPUS= } + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 + ISOLCPUS= # # Use the sched domain in debugfs to check isolated CPUs, if available @@ -713,6 +731,9 @@ check_isolcpus() fi done [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU + [[ -n "$BOOT_ISOLCPUS" ]] && + ISOLCPUS=$(echo $ISOLCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") + [[ "$EXPECT_VAL" = "$ISOLCPUS" ]] } @@ -730,7 +751,8 @@ test_fail() } # -# Check to see if there are unexpected isolated CPUs left +# Check to see if there are unexpected isolated CPUs left beyond the boot +# time isolated ones. # null_isolcpus_check() { -- cgit v1.2.3 From 3f9319c6914c45a2d406faf3aa8cd0dee8bc4c2f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:29 +0000 Subject: cgroup/cpuset: add selftest for cpuset v1 There is only a hotplug test for cpuset v1, so add a basic read/write test for the cpuset v1 interfaces. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- MAINTAINERS | 1 + .../selftests/cgroup/test_cpuset_v1_base.sh | 77 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100755 tools/testing/selftests/cgroup/test_cpuset_v1_base.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 3b5ec1cafd95..b59f54e1e30d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5703,6 +5703,7 @@ F: kernel/cgroup/cpuset-v1.c F: kernel/cgroup/cpuset.c F: tools/testing/selftests/cgroup/test_cpuset.c F: tools/testing/selftests/cgroup/test_cpuset_prs.sh +F: tools/testing/selftests/cgroup/test_cpuset_v1_base.sh CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh new file mode 100755 index 000000000000..42a6628fb8bc --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Basic test for cpuset v1 interfaces write/read +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +write_test() { + dir=$1 + interface=$2 + value=$3 + original=$(cat $dir/$interface) + echo "testing $interface $value" + echo $value > $dir/$interface + new=$(cat $dir/$interface) + [[ "$value" != "$new" ]] && { + echo "$interface write $value failed: new:$new" + exit 1 + } +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!"
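+# (Illustrative: with cpuset v1 mounted at /sys/fs/cgroup/cpuset, the
+# "mount -t cgroup" output contains a line such as
+#	cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,relatime,cpuset)
+# and awk '{print $3}' extracts the mount point field.)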
+ +# +# Create a test cpuset, read/write test +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR + +ITF_MATRIX=( + #interface value expect root_only + 'cpuset.cpus 0-1 0-1 0' + 'cpuset.mem_exclusive 1 1 0' + 'cpuset.mem_exclusive 0 0 0' + 'cpuset.mem_hardwall 1 1 0' + 'cpuset.mem_hardwall 0 0 0' + 'cpuset.memory_migrate 1 1 0' + 'cpuset.memory_migrate 0 0 0' + 'cpuset.memory_spread_page 1 1 0' + 'cpuset.memory_spread_page 0 0 0' + 'cpuset.memory_spread_slab 1 1 0' + 'cpuset.memory_spread_slab 0 0 0' + 'cpuset.mems 0 0 0' + 'cpuset.sched_load_balance 1 1 0' + 'cpuset.sched_load_balance 0 0 0' + 'cpuset.sched_relax_domain_level 2 2 0' + 'cpuset.memory_pressure_enabled 1 1 1' + 'cpuset.memory_pressure_enabled 0 0 1' +) + +run_test() +{ + cnt="${ITF_MATRIX[@]}" + for i in "${ITF_MATRIX[@]}" ; do + args=($i) + root_only=${args[3]} + [[ $root_only -eq 1 ]] && { + write_test "$CPUSET" "${args[0]}" "${args[1]}" "${args[2]}" + continue + } + write_test "$CPUSET/$TDIR" "${args[0]}" "${args[1]}" "${args[2]}" + done +} + +run_test +rmdir $CPUSET/$TDIR +echo "Test PASSED" +exit 0 -- cgit v1.2.3 From d075bccec082be326e08d8b4f7cd491029438f0b Mon Sep 17 00:00:00 2001 From: David Finkel Date: Mon, 29 Jul 2024 10:37:43 -0400 Subject: mm, memcg: cg2 memory{.swap,}.peak write tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend two existing tests to cover extracting memory usage through the newly mutable memory.peak and memory.swap.peak handlers. In particular, make sure to exercise adding and removing watchers with overlapping lifetimes so the less-trivial logic gets tested. The new/updated tests attempt to detect a lack of the write handler by fstat'ing the memory.peak and memory.swap.peak files and skip the tests if that's the case. Additionally, skip if the file doesn't exist at all. [davidf@vimeo.com: update tests] Link: https://lkml.kernel.org/r/20240730231304.761942-3-davidf@vimeo.com Link: https://lkml.kernel.org/r/20240729143743.34236-3-davidf@vimeo.com Signed-off-by: David Finkel Acked-by: Tejun Heo Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Shakeel Butt Cc: Shuah Khan Cc: Waiman Long Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/cgroup_util.c | 22 ++ tools/testing/selftests/cgroup/cgroup_util.h | 2 + tools/testing/selftests/cgroup/test_memcontrol.c | 264 ++++++++++++++++++++++- 3 files changed, 280 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 432db923bced..1e2d46636a0c 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -141,6 +141,16 @@ long cg_read_long(const char *cgroup, const char *control) return atol(buf); } +long cg_read_long_fd(int fd) +{ + char buf[128] = {}; + + if (pread(fd, buf, sizeof(buf) - 1, 0) <= 0) + return -1; + + return atol(buf); +} + long cg_read_key_long(const char *cgroup, const char *control, const char *key) { char buf[PAGE_SIZE]; @@ -183,6 +193,18 @@ int cg_write(const char *cgroup, const char *control, char *buf) return ret == len ? 0 : ret; } +/* + * Returns fd on success, or -1 on failure.
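+ * A hypothetical caller (illustrative only, not from this patch) pairs
+ * it with cg_read_long_fd():
+ *
+ *	int fd = cg_open(cgroup, "memory.peak", O_RDONLY | O_CLOEXEC);
+ *	long peak = fd < 0 ? -1 : cg_read_long_fd(fd);
+ *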
+ * (fd should be closed with close() as usual) + */ +int cg_open(const char *cgroup, const char *control, int flags) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/%s", cgroup, control); + return open(path, flags); +} + int cg_write_numeric(const char *cgroup, const char *control, long value) { char buf[64]; diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index e8d04ac9e3d2..19b131ee7707 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -34,9 +34,11 @@ extern int cg_read_strcmp(const char *cgroup, const char *control, extern int cg_read_strstr(const char *cgroup, const char *control, const char *needle); extern long cg_read_long(const char *cgroup, const char *control); +extern long cg_read_long_fd(int fd); long cg_read_key_long(const char *cgroup, const char *control, const char *key); extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); +extern int cg_open(const char *cgroup, const char *control, int flags); int cg_write_numeric(const char *cgroup, const char *control, long value); extern int cg_run(const char *cgroup, int (*fn)(const char *cgroup, void *arg), diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 41ae8047b889..16f5d74ae762 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -161,13 +161,16 @@ cleanup: /* * This test create a memory cgroup, allocates * some anonymous memory and some pagecache - * and check memory.current and some memory.stat values. + * and checks memory.current, memory.peak, and some memory.stat values. */ -static int test_memcg_current(const char *root) +static int test_memcg_current_peak(const char *root) { int ret = KSFT_FAIL; - long current; + long current, peak, peak_reset; char *memcg; + bool fd2_closed = false, fd3_closed = false, fd4_closed = false; + int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1; + struct stat ss; memcg = cg_name(root, "memcg_test"); if (!memcg) @@ -180,15 +183,124 @@ static int test_memcg_current(const char *root) if (current != 0) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak != 0) + goto cleanup; + if (cg_run(memcg, alloc_anon_50M_check, NULL)) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + /* + * We'll open a few FDs for the same memory.peak file to exercise the free-path + * We need at least three to be closed in a different order than writes occurred to test + * the linked-list handling. + */ + peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd == -1) { + if (errno == ENOENT) + ret = KSFT_SKIP; + goto cleanup; + } + + /* + * Before we try to use memory.peak's fd, try to figure out whether + * this kernel supports writing to that file in the first place. 
(by + * checking the writable bit on the file's st_mode) + */ + if (fstat(peak_fd, &ss)) + goto cleanup; + + if ((ss.st_mode & S_IWUSR) == 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd2 == -1) + goto cleanup; + + peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd3 == -1) + goto cleanup; + + /* any non-empty string resets, but make it clear */ + static const char reset_string[] = "reset\n"; + + peak_reset = write(peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(peak_fd2, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(peak_fd3, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + /* Make sure a completely independent read isn't affected by our FD-local reset above*/ + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + fd2_closed = true; + if (close(peak_fd2)) + goto cleanup; + + peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd4 == -1) + goto cleanup; + + peak_reset = write(peak_fd4, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak = cg_read_long_fd(peak_fd); + if (peak > MB(30) || peak < 0) + goto cleanup; + if (cg_run(memcg, alloc_pagecache_50M_check, NULL)) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + /* Make sure everything is back to normal */ + peak = cg_read_long_fd(peak_fd); + if (peak < MB(50)) + goto cleanup; + + peak = cg_read_long_fd(peak_fd4); + if (peak < MB(50)) + goto cleanup; + + fd3_closed = true; + if (close(peak_fd3)) + goto cleanup; + + fd4_closed = true; + if (close(peak_fd4)) + goto cleanup; + ret = KSFT_PASS; cleanup: + close(peak_fd); + if (!fd2_closed) + close(peak_fd2); + if (!fd3_closed) + close(peak_fd3); + if (!fd4_closed) + close(peak_fd4); cg_destroy(memcg); free(memcg); @@ -817,13 +929,19 @@ cleanup: /* * This test checks that memory.swap.max limits the amount of - * anonymous memory which can be swapped out. + * anonymous memory which can be swapped out. Additionally, it verifies that + * memory.swap.peak reflects the high watermark and can be reset. */ -static int test_memcg_swap_max(const char *root) +static int test_memcg_swap_max_peak(const char *root) { int ret = KSFT_FAIL; char *memcg; - long max; + long max, peak; + struct stat ss; + int swap_peak_fd = -1, mem_peak_fd = -1; + + /* any non-empty string resets */ + static const char reset_string[] = "foobarbaz"; if (!is_swap_enabled()) return KSFT_SKIP; @@ -840,6 +958,61 @@ static int test_memcg_swap_max(const char *root) goto cleanup; } + swap_peak_fd = cg_open(memcg, "memory.swap.peak", + O_RDWR | O_APPEND | O_CLOEXEC); + + if (swap_peak_fd == -1) { + if (errno == ENOENT) + ret = KSFT_SKIP; + goto cleanup; + } + + /* + * Before we try to use memory.swap.peak's fd, try to figure out + * whether this kernel supports writing to that file in the first + * place. 
(by checking the writable bit on the file's st_mode) + */ + if (fstat(swap_peak_fd, &ss)) + goto cleanup; + + if ((ss.st_mode & S_IWUSR) == 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (mem_peak_fd == -1) + goto cleanup; + + if (cg_read_long(memcg, "memory.swap.peak")) + goto cleanup; + + if (cg_read_long_fd(swap_peak_fd)) + goto cleanup; + + /* switch the swap and mem fds into local-peak tracking mode*/ + int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); + + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + if (cg_read_long_fd(swap_peak_fd)) + goto cleanup; + + if (cg_read_long(memcg, "memory.peak")) + goto cleanup; + + if (cg_read_long_fd(mem_peak_fd)) + goto cleanup; + + peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + if (cg_read_long_fd(mem_peak_fd)) + goto cleanup; + if (cg_read_strcmp(memcg, "memory.max", "max\n")) goto cleanup; @@ -862,6 +1035,61 @@ static int test_memcg_swap_max(const char *root) if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long(memcg, "memory.swap.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(mem_peak_fd); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak < MB(29)) + goto cleanup; + + /* + * open, reset and close the peak swap on another FD to make sure + * multiple extant fds don't corrupt the linked-list + */ + peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string); + if (peak_reset) + goto cleanup; + + peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string); + if (peak_reset) + goto cleanup; + + /* actually reset on the fds */ + peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak > MB(10)) + goto cleanup; + + /* + * The cgroup is now empty, but there may be a page or two associated + * with the open FD accounted to it. 
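+ *
+ * To recap the two scopes checked below (values are illustrative):
+ *
+ *	write(fd, "reset", 6);                  fd now tracks a local peak
+ *	cg_read_long_fd(fd);                    peak since that write, ~0
+ *	cg_read_long(memcg, "memory.peak");     global watermark, still ~30MB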
+ */ + peak = cg_read_long_fd(mem_peak_fd); + if (peak > MB(1)) + goto cleanup; + + if (cg_read_long(memcg, "memory.peak") < MB(29)) + goto cleanup; + + if (cg_read_long(memcg, "memory.swap.peak") < MB(29)) + goto cleanup; + if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30))) goto cleanup; @@ -869,9 +1097,29 @@ static int test_memcg_swap_max(const char *root) if (max <= 0) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long(memcg, "memory.swap.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(mem_peak_fd); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak < MB(19)) + goto cleanup; + ret = KSFT_PASS; cleanup: + if (mem_peak_fd != -1 && close(mem_peak_fd)) + ret = KSFT_FAIL; + if (swap_peak_fd != -1 && close(swap_peak_fd)) + ret = KSFT_FAIL; cg_destroy(memcg); free(memcg); @@ -1295,7 +1543,7 @@ struct memcg_test { const char *name; } tests[] = { T(test_memcg_subtree_control), - T(test_memcg_current), + T(test_memcg_current_peak), T(test_memcg_min), T(test_memcg_low), T(test_memcg_high), @@ -1303,7 +1551,7 @@ struct memcg_test { T(test_memcg_max), T(test_memcg_reclaim), T(test_memcg_oom_events), - T(test_memcg_swap_max), + T(test_memcg_swap_max_peak), T(test_memcg_sock), T(test_memcg_oom_group_leaf_events), T(test_memcg_oom_group_parent_events), -- cgit v1.2.3 From 74579d8dab476b66cd28715e73832ab777f20984 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 29 Jul 2024 12:50:40 +0100 Subject: tools: separate out shared radix-tree components The core components contained within the radix-tree tests which provide shims for kernel headers and access to the maple tree are useful for testing other things, so separate them out and make the radix tree tests dependent on the shared components. This lays the groundwork for us to add VMA tests of the newly introduced vma.c file. Link: https://lkml.kernel.org/r/1ee720c265808168e0d75608e687607d77c36719.1722251717.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Brendan Higgins Cc: Christian Brauner Cc: David Gow Cc: Eric W. 
Biederman Cc: Jan Kara Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Rae Moar Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Pengfei Xu Signed-off-by: Andrew Morton --- tools/testing/radix-tree/.gitignore | 1 + tools/testing/radix-tree/Makefile | 72 +----- tools/testing/radix-tree/generated/autoconf.h | 2 - tools/testing/radix-tree/linux.c | 271 --------------------- tools/testing/radix-tree/linux/bug.h | 2 - tools/testing/radix-tree/linux/cpu.h | 1 - tools/testing/radix-tree/linux/idr.h | 1 - tools/testing/radix-tree/linux/init.h | 2 - tools/testing/radix-tree/linux/kconfig.h | 1 - tools/testing/radix-tree/linux/kernel.h | 29 --- tools/testing/radix-tree/linux/kmemleak.h | 1 - tools/testing/radix-tree/linux/local_lock.h | 8 - tools/testing/radix-tree/linux/lockdep.h | 16 -- tools/testing/radix-tree/linux/maple_tree.h | 7 - tools/testing/radix-tree/linux/percpu.h | 11 - tools/testing/radix-tree/linux/preempt.h | 15 -- tools/testing/radix-tree/linux/radix-tree.h | 26 -- tools/testing/radix-tree/linux/rcupdate.h | 12 - tools/testing/radix-tree/linux/xarray.h | 2 - tools/testing/radix-tree/trace/events/maple_tree.h | 5 - tools/testing/radix-tree/xarray.c | 10 +- tools/testing/shared/autoconf.h | 2 + tools/testing/shared/linux.c | 271 +++++++++++++++++++++ tools/testing/shared/linux/bug.h | 2 + tools/testing/shared/linux/cpu.h | 1 + tools/testing/shared/linux/idr.h | 1 + tools/testing/shared/linux/init.h | 2 + tools/testing/shared/linux/kconfig.h | 1 + tools/testing/shared/linux/kernel.h | 29 +++ tools/testing/shared/linux/kmemleak.h | 1 + tools/testing/shared/linux/local_lock.h | 8 + tools/testing/shared/linux/lockdep.h | 16 ++ tools/testing/shared/linux/maple_tree.h | 7 + tools/testing/shared/linux/percpu.h | 11 + tools/testing/shared/linux/preempt.h | 15 ++ tools/testing/shared/linux/radix-tree.h | 26 ++ tools/testing/shared/linux/rcupdate.h | 12 + tools/testing/shared/linux/xarray.h | 2 + tools/testing/shared/maple-shared.h | 9 + tools/testing/shared/maple-shim.c | 7 + tools/testing/shared/shared.h | 33 +++ tools/testing/shared/shared.mk | 72 ++++++ tools/testing/shared/trace/events/maple_tree.h | 5 + tools/testing/shared/xarray-shared.c | 5 + tools/testing/shared/xarray-shared.h | 4 + 45 files changed, 556 insertions(+), 481 deletions(-) delete mode 100644 tools/testing/radix-tree/generated/autoconf.h delete mode 100644 tools/testing/radix-tree/linux.c delete mode 100644 tools/testing/radix-tree/linux/bug.h delete mode 100644 tools/testing/radix-tree/linux/cpu.h delete mode 100644 tools/testing/radix-tree/linux/idr.h delete mode 100644 tools/testing/radix-tree/linux/init.h delete mode 100644 tools/testing/radix-tree/linux/kconfig.h delete mode 100644 tools/testing/radix-tree/linux/kernel.h delete mode 100644 tools/testing/radix-tree/linux/kmemleak.h delete mode 100644 tools/testing/radix-tree/linux/local_lock.h delete mode 100644 tools/testing/radix-tree/linux/lockdep.h delete mode 100644 tools/testing/radix-tree/linux/maple_tree.h delete mode 100644 tools/testing/radix-tree/linux/percpu.h delete mode 100644 tools/testing/radix-tree/linux/preempt.h delete mode 100644 tools/testing/radix-tree/linux/radix-tree.h delete mode 100644 tools/testing/radix-tree/linux/rcupdate.h delete mode 100644 tools/testing/radix-tree/linux/xarray.h delete mode 100644 tools/testing/radix-tree/trace/events/maple_tree.h create mode 100644 tools/testing/shared/autoconf.h create mode 100644 tools/testing/shared/linux.c create mode 100644 tools/testing/shared/linux/bug.h create mode 100644 
tools/testing/shared/linux/cpu.h create mode 100644 tools/testing/shared/linux/idr.h create mode 100644 tools/testing/shared/linux/init.h create mode 100644 tools/testing/shared/linux/kconfig.h create mode 100644 tools/testing/shared/linux/kernel.h create mode 100644 tools/testing/shared/linux/kmemleak.h create mode 100644 tools/testing/shared/linux/local_lock.h create mode 100644 tools/testing/shared/linux/lockdep.h create mode 100644 tools/testing/shared/linux/maple_tree.h create mode 100644 tools/testing/shared/linux/percpu.h create mode 100644 tools/testing/shared/linux/preempt.h create mode 100644 tools/testing/shared/linux/radix-tree.h create mode 100644 tools/testing/shared/linux/rcupdate.h create mode 100644 tools/testing/shared/linux/xarray.h create mode 100644 tools/testing/shared/maple-shared.h create mode 100644 tools/testing/shared/maple-shim.c create mode 100644 tools/testing/shared/shared.h create mode 100644 tools/testing/shared/shared.mk create mode 100644 tools/testing/shared/trace/events/maple_tree.h create mode 100644 tools/testing/shared/xarray-shared.c create mode 100644 tools/testing/shared/xarray-shared.h (limited to 'tools') diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore index 49bccb90c35b..ce167a761981 100644 --- a/tools/testing/radix-tree/.gitignore +++ b/tools/testing/radix-tree/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +generated/autoconf.h generated/bit-length.h generated/map-shift.h idr.c diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index d1acd7d58850..8b3591a51e1f 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -1,77 +1,29 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I. 
-I../../include -I../../../lib -g -Og -Wall \ - -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined -LDFLAGS += -fsanitize=address -fsanitize=undefined -LDLIBS+= -lpthread -lurcu -TARGETS = main idr-test multiorder xarray maple -LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o -CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o maple.o $(LIBS) -OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ - regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \ - iteration_check_2.o benchmark.o - -ifndef SHIFT - SHIFT=3 -endif +.PHONY: clean -ifeq ($(BUILD), 32) - CFLAGS += -m32 - LDFLAGS += -m32 -LONG_BIT := 32 -endif - -ifndef LONG_BIT -LONG_BIT := $(shell getconf LONG_BIT) -endif +TARGETS = main idr-test multiorder xarray maple +CORE_OFILES = $(SHARED_OFILES) xarray.o maple.o test.o +OFILES = main.o $(CORE_OFILES) regression1.o regression2.o \ + regression3.o regression4.o tag_check.o multiorder.o idr-test.o \ + iteration_check.o iteration_check_2.o benchmark.o targets: generated/map-shift.h generated/bit-length.h $(TARGETS) +include ../shared/shared.mk + main: $(OFILES) idr-test.o: ../../../lib/test_ida.c idr-test: idr-test.o $(CORE_OFILES) -xarray: $(CORE_OFILES) +xarray: $(CORE_OFILES) xarray.o -maple: $(CORE_OFILES) +maple: $(CORE_OFILES) maple.o multiorder: multiorder.o $(CORE_OFILES) clean: - $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h generated/bit-length.h - -vpath %.c ../../lib - -$(OFILES): Makefile *.h */*.h generated/map-shift.h generated/bit-length.h \ - ../../include/linux/*.h \ - ../../include/asm/*.h \ - ../../../include/linux/xarray.h \ - ../../../include/linux/maple_tree.h \ - ../../../include/linux/radix-tree.h \ - ../../../lib/radix-tree.h \ - ../../../include/linux/idr.h - -radix-tree.c: ../../../lib/radix-tree.c - sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ - -idr.c: ../../../lib/idr.c - sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ - -xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c - -maple.o: ../../../lib/maple_tree.c ../../../lib/test_maple_tree.c - -generated/map-shift.h: - @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ - echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ - generated/map-shift.h; \ - fi - -generated/bit-length.h: FORCE - @if ! 
grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \ - echo "Generating $@"; \ - echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \ - fi + $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/* -FORCE: ; +$(OFILES): $(SHARED_DEPS) *.h diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h deleted file mode 100644 index 92dc474c349b..000000000000 --- a/tools/testing/radix-tree/generated/autoconf.h +++ /dev/null @@ -1,2 +0,0 @@ -#include "bit-length.h" -#define CONFIG_XARRAY_MULTI 1 diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c deleted file mode 100644 index 4eb442206d01..000000000000 --- a/tools/testing/radix-tree/linux.c +++ /dev/null @@ -1,271 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -int nr_allocated; -int preempt_count; -int test_verbose; - -struct kmem_cache { - pthread_mutex_t lock; - unsigned int size; - unsigned int align; - int nr_objs; - void *objs; - void (*ctor)(void *); - unsigned int non_kernel; - unsigned long nr_allocated; - unsigned long nr_tallocated; -}; - -void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) -{ - cachep->non_kernel = val; -} - -unsigned long kmem_cache_get_alloc(struct kmem_cache *cachep) -{ - return cachep->size * cachep->nr_allocated; -} - -unsigned long kmem_cache_nr_allocated(struct kmem_cache *cachep) -{ - return cachep->nr_allocated; -} - -unsigned long kmem_cache_nr_tallocated(struct kmem_cache *cachep) -{ - return cachep->nr_tallocated; -} - -void kmem_cache_zero_nr_tallocated(struct kmem_cache *cachep) -{ - cachep->nr_tallocated = 0; -} - -void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, - int gfp) -{ - void *p; - - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) - return NULL; - - cachep->non_kernel--; - } - - pthread_mutex_lock(&cachep->lock); - if (cachep->nr_objs) { - struct radix_tree_node *node = cachep->objs; - cachep->nr_objs--; - cachep->objs = node->parent; - pthread_mutex_unlock(&cachep->lock); - node->parent = NULL; - p = node; - } else { - pthread_mutex_unlock(&cachep->lock); - if (cachep->align) - posix_memalign(&p, cachep->align, cachep->size); - else - p = malloc(cachep->size); - if (cachep->ctor) - cachep->ctor(p); - else if (gfp & __GFP_ZERO) - memset(p, 0, cachep->size); - } - - uatomic_inc(&cachep->nr_allocated); - uatomic_inc(&nr_allocated); - uatomic_inc(&cachep->nr_tallocated); - if (kmalloc_verbose) - printf("Allocating %p from slab\n", p); - return p; -} - -void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) -{ - assert(objp); - if (cachep->nr_objs > 10 || cachep->align) { - memset(objp, POISON_FREE, cachep->size); - free(objp); - } else { - struct radix_tree_node *node = objp; - cachep->nr_objs++; - node->parent = cachep->objs; - cachep->objs = node; - } -} - -void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) -{ - uatomic_dec(&nr_allocated); - uatomic_dec(&cachep->nr_allocated); - if (kmalloc_verbose) - printf("Freeing %p to slab\n", objp); - __kmem_cache_free_locked(cachep, objp); -} - -void kmem_cache_free(struct kmem_cache *cachep, void *objp) -{ - pthread_mutex_lock(&cachep->lock); - kmem_cache_free_locked(cachep, objp); - pthread_mutex_unlock(&cachep->lock); -} - -void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list) -{ - if (kmalloc_verbose) - pr_debug("Bulk free %p[0-%lu]\n", list, size 
- 1); - - pthread_mutex_lock(&cachep->lock); - for (int i = 0; i < size; i++) - kmem_cache_free_locked(cachep, list[i]); - pthread_mutex_unlock(&cachep->lock); -} - -void kmem_cache_shrink(struct kmem_cache *cachep) -{ -} - -int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, - void **p) -{ - size_t i; - - if (kmalloc_verbose) - pr_debug("Bulk alloc %lu\n", size); - - pthread_mutex_lock(&cachep->lock); - if (cachep->nr_objs >= size) { - struct radix_tree_node *node; - - for (i = 0; i < size; i++) { - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) - break; - cachep->non_kernel--; - } - - node = cachep->objs; - cachep->nr_objs--; - cachep->objs = node->parent; - p[i] = node; - node->parent = NULL; - } - pthread_mutex_unlock(&cachep->lock); - } else { - pthread_mutex_unlock(&cachep->lock); - for (i = 0; i < size; i++) { - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) - break; - cachep->non_kernel--; - } - - if (cachep->align) { - posix_memalign(&p[i], cachep->align, - cachep->size); - } else { - p[i] = malloc(cachep->size); - if (!p[i]) - break; - } - if (cachep->ctor) - cachep->ctor(p[i]); - else if (gfp & __GFP_ZERO) - memset(p[i], 0, cachep->size); - } - } - - if (i < size) { - size = i; - pthread_mutex_lock(&cachep->lock); - for (i = 0; i < size; i++) - __kmem_cache_free_locked(cachep, p[i]); - pthread_mutex_unlock(&cachep->lock); - return 0; - } - - for (i = 0; i < size; i++) { - uatomic_inc(&nr_allocated); - uatomic_inc(&cachep->nr_allocated); - uatomic_inc(&cachep->nr_tallocated); - if (kmalloc_verbose) - printf("Allocating %p from slab\n", p[i]); - } - - return size; -} - -struct kmem_cache * -kmem_cache_create(const char *name, unsigned int size, unsigned int align, - unsigned int flags, void (*ctor)(void *)) -{ - struct kmem_cache *ret = malloc(sizeof(*ret)); - - pthread_mutex_init(&ret->lock, NULL); - ret->size = size; - ret->align = align; - ret->nr_objs = 0; - ret->nr_allocated = 0; - ret->nr_tallocated = 0; - ret->objs = NULL; - ret->ctor = ctor; - ret->non_kernel = 0; - return ret; -} - -/* - * Test the test infrastructure for kem_cache_alloc/free and bulk counterparts. 
- */ -void test_kmem_cache_bulk(void) -{ - int i; - void *list[12]; - static struct kmem_cache *test_cache, *test_cache2; - - /* - * Testing the bulk allocators without aligned kmem_cache to force the - * bulk alloc/free to reuse - */ - test_cache = kmem_cache_create("test_cache", 256, 0, SLAB_PANIC, NULL); - - for (i = 0; i < 5; i++) - list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); - - for (i = 0; i < 5; i++) - kmem_cache_free(test_cache, list[i]); - assert(test_cache->nr_objs == 5); - - kmem_cache_alloc_bulk(test_cache, __GFP_DIRECT_RECLAIM, 5, list); - kmem_cache_free_bulk(test_cache, 5, list); - - for (i = 0; i < 12 ; i++) - list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); - - for (i = 0; i < 12; i++) - kmem_cache_free(test_cache, list[i]); - - /* The last free will not be kept around */ - assert(test_cache->nr_objs == 11); - - /* Aligned caches will immediately free */ - test_cache2 = kmem_cache_create("test_cache2", 128, 128, SLAB_PANIC, NULL); - - kmem_cache_alloc_bulk(test_cache2, __GFP_DIRECT_RECLAIM, 10, list); - kmem_cache_free_bulk(test_cache2, 10, list); - assert(!test_cache2->nr_objs); - - -} diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h deleted file mode 100644 index 03dc8a57eb99..000000000000 --- a/tools/testing/radix-tree/linux/bug.h +++ /dev/null @@ -1,2 +0,0 @@ -#include -#include "asm/bug.h" diff --git a/tools/testing/radix-tree/linux/cpu.h b/tools/testing/radix-tree/linux/cpu.h deleted file mode 100644 index a45530d78107..000000000000 --- a/tools/testing/radix-tree/linux/cpu.h +++ /dev/null @@ -1 +0,0 @@ -#define cpuhp_setup_state_nocalls(a, b, c, d) (0) diff --git a/tools/testing/radix-tree/linux/idr.h b/tools/testing/radix-tree/linux/idr.h deleted file mode 100644 index 4e342f2e37cf..000000000000 --- a/tools/testing/radix-tree/linux/idr.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../../../include/linux/idr.h" diff --git a/tools/testing/radix-tree/linux/init.h b/tools/testing/radix-tree/linux/init.h deleted file mode 100644 index 81563c3dfce7..000000000000 --- a/tools/testing/radix-tree/linux/init.h +++ /dev/null @@ -1,2 +0,0 @@ -#define __init -#define __exit diff --git a/tools/testing/radix-tree/linux/kconfig.h b/tools/testing/radix-tree/linux/kconfig.h deleted file mode 100644 index 6c8675859913..000000000000 --- a/tools/testing/radix-tree/linux/kconfig.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../../../include/linux/kconfig.h" diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h deleted file mode 100644 index c0a2bb785b92..000000000000 --- a/tools/testing/radix-tree/linux/kernel.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _KERNEL_H -#define _KERNEL_H - -#include "../../include/linux/kernel.h" -#include -#include -#include - -#include -#include -#include -#include -#include "../../../include/linux/kconfig.h" - -#define printk printf -#define pr_err printk -#define pr_info printk -#define pr_debug printk -#define pr_cont printk -#define schedule() -#define PAGE_SHIFT 12 - -#define __acquires(x) -#define __releases(x) -#define __must_hold(x) - -#define EXPORT_PER_CPU_SYMBOL_GPL(x) -#endif /* _KERNEL_H */ diff --git a/tools/testing/radix-tree/linux/kmemleak.h b/tools/testing/radix-tree/linux/kmemleak.h deleted file mode 100644 index 155f112786c4..000000000000 --- a/tools/testing/radix-tree/linux/kmemleak.h +++ /dev/null @@ -1 +0,0 @@ -static inline void kmemleak_update_trace(const void *ptr) { } diff --git 
a/tools/testing/radix-tree/linux/local_lock.h b/tools/testing/radix-tree/linux/local_lock.h deleted file mode 100644 index b3cf8b233ca4..000000000000 --- a/tools/testing/radix-tree/linux/local_lock.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _LINUX_LOCAL_LOCK -#define _LINUX_LOCAL_LOCK -typedef struct { } local_lock_t; - -static inline void local_lock(local_lock_t *lock) { } -static inline void local_unlock(local_lock_t *lock) { } -#define INIT_LOCAL_LOCK(x) { } -#endif diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h deleted file mode 100644 index 62473ab57f99..000000000000 --- a/tools/testing/radix-tree/linux/lockdep.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _LINUX_LOCKDEP_H -#define _LINUX_LOCKDEP_H - -#include - -struct lock_class_key { - unsigned int a; -}; - -static inline void lockdep_set_class(spinlock_t *lock, - struct lock_class_key *key) -{ -} - -extern int lockdep_is_held(const void *); -#endif /* _LINUX_LOCKDEP_H */ diff --git a/tools/testing/radix-tree/linux/maple_tree.h b/tools/testing/radix-tree/linux/maple_tree.h deleted file mode 100644 index 06c89bdcc515..000000000000 --- a/tools/testing/radix-tree/linux/maple_tree.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -#define atomic_t int32_t -#define atomic_inc(x) uatomic_inc(x) -#define atomic_read(x) uatomic_read(x) -#define atomic_set(x, y) do {} while (0) -#define U8_MAX UCHAR_MAX -#include "../../../../include/linux/maple_tree.h" diff --git a/tools/testing/radix-tree/linux/percpu.h b/tools/testing/radix-tree/linux/percpu.h deleted file mode 100644 index b2403aa743b2..000000000000 --- a/tools/testing/radix-tree/linux/percpu.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#define DECLARE_PER_CPU(type, val) extern type val -#define DEFINE_PER_CPU(type, val) type val - -#define __get_cpu_var(var) var -#define this_cpu_ptr(var) var -#define this_cpu_read(var) var -#define this_cpu_xchg(var, val) uatomic_xchg(&var, val) -#define this_cpu_cmpxchg(var, old, new) uatomic_cmpxchg(&var, old, new) -#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h deleted file mode 100644 index edb10302b903..000000000000 --- a/tools/testing/radix-tree/linux/preempt.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_PREEMPT_H -#define __LINUX_PREEMPT_H - -extern int preempt_count; - -#define preempt_disable() uatomic_inc(&preempt_count) -#define preempt_enable() uatomic_dec(&preempt_count) - -static inline int in_interrupt(void) -{ - return 0; -} - -#endif /* __LINUX_PREEMPT_H */ diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/radix-tree/linux/radix-tree.h deleted file mode 100644 index d1635a5bef02..000000000000 --- a/tools/testing/radix-tree/linux/radix-tree.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _TEST_RADIX_TREE_H -#define _TEST_RADIX_TREE_H - -#include "../../../../include/linux/radix-tree.h" - -extern int kmalloc_verbose; -extern int test_verbose; - -static inline void trace_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *head)) -{ - if (kmalloc_verbose) - printf("Delaying free of %p to slab\n", (char *)head - - offsetof(struct radix_tree_node, rcu_head)); - call_rcu(head, func); -} - -#define printv(verbosity_level, fmt, ...) 
\ - if(test_verbose >= verbosity_level) \ - printf(fmt, ##__VA_ARGS__) - -#undef call_rcu -#define call_rcu(x, y) trace_call_rcu(x, y) - -#endif /* _TEST_RADIX_TREE_H */ diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h deleted file mode 100644 index fed468fb0c78..000000000000 --- a/tools/testing/radix-tree/linux/rcupdate.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _RCUPDATE_H -#define _RCUPDATE_H - -#include - -#define rcu_dereference_raw(p) rcu_dereference(p) -#define rcu_dereference_protected(p, cond) rcu_dereference(p) -#define rcu_dereference_check(p, cond) rcu_dereference(p) -#define RCU_INIT_POINTER(p, v) do { (p) = (v); } while (0) - -#endif diff --git a/tools/testing/radix-tree/linux/xarray.h b/tools/testing/radix-tree/linux/xarray.h deleted file mode 100644 index df3812cda376..000000000000 --- a/tools/testing/radix-tree/linux/xarray.h +++ /dev/null @@ -1,2 +0,0 @@ -#include "generated/map-shift.h" -#include "../../../../include/linux/xarray.h" diff --git a/tools/testing/radix-tree/trace/events/maple_tree.h b/tools/testing/radix-tree/trace/events/maple_tree.h deleted file mode 100644 index 97d0e1ddcf08..000000000000 --- a/tools/testing/radix-tree/trace/events/maple_tree.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ - -#define trace_ma_op(a, b) do {} while (0) -#define trace_ma_read(a, b) do {} while (0) -#define trace_ma_write(a, b, c, d) do {} while (0) diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c index d0e53bff1eb6..253208a8541b 100644 --- a/tools/testing/radix-tree/xarray.c +++ b/tools/testing/radix-tree/xarray.c @@ -4,17 +4,9 @@ * Copyright (c) 2018 Matthew Wilcox */ -#define XA_DEBUG +#include "xarray-shared.h" #include "test.h" -#define module_init(x) -#define module_exit(x) -#define MODULE_AUTHOR(x) -#define MODULE_DESCRIPTION(X) -#define MODULE_LICENSE(x) -#define dump_stack() assert(0) - -#include "../../../lib/xarray.c" #undef XA_DEBUG #include "../../../lib/test_xarray.c" diff --git a/tools/testing/shared/autoconf.h b/tools/testing/shared/autoconf.h new file mode 100644 index 000000000000..92dc474c349b --- /dev/null +++ b/tools/testing/shared/autoconf.h @@ -0,0 +1,2 @@ +#include "bit-length.h" +#define CONFIG_XARRAY_MULTI 1 diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c new file mode 100644 index 000000000000..4eb442206d01 --- /dev/null +++ b/tools/testing/shared/linux.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +int nr_allocated; +int preempt_count; +int test_verbose; + +struct kmem_cache { + pthread_mutex_t lock; + unsigned int size; + unsigned int align; + int nr_objs; + void *objs; + void (*ctor)(void *); + unsigned int non_kernel; + unsigned long nr_allocated; + unsigned long nr_tallocated; +}; + +void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) +{ + cachep->non_kernel = val; +} + +unsigned long kmem_cache_get_alloc(struct kmem_cache *cachep) +{ + return cachep->size * cachep->nr_allocated; +} + +unsigned long kmem_cache_nr_allocated(struct kmem_cache *cachep) +{ + return cachep->nr_allocated; +} + +unsigned long kmem_cache_nr_tallocated(struct kmem_cache *cachep) +{ + return cachep->nr_tallocated; +} + +void kmem_cache_zero_nr_tallocated(struct kmem_cache *cachep) +{ + cachep->nr_tallocated = 0; +} + +void *kmem_cache_alloc_lru(struct 
kmem_cache *cachep, struct list_lru *lru, + int gfp) +{ + void *p; + + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + return NULL; + + cachep->non_kernel--; + } + + pthread_mutex_lock(&cachep->lock); + if (cachep->nr_objs) { + struct radix_tree_node *node = cachep->objs; + cachep->nr_objs--; + cachep->objs = node->parent; + pthread_mutex_unlock(&cachep->lock); + node->parent = NULL; + p = node; + } else { + pthread_mutex_unlock(&cachep->lock); + if (cachep->align) + posix_memalign(&p, cachep->align, cachep->size); + else + p = malloc(cachep->size); + if (cachep->ctor) + cachep->ctor(p); + else if (gfp & __GFP_ZERO) + memset(p, 0, cachep->size); + } + + uatomic_inc(&cachep->nr_allocated); + uatomic_inc(&nr_allocated); + uatomic_inc(&cachep->nr_tallocated); + if (kmalloc_verbose) + printf("Allocating %p from slab\n", p); + return p; +} + +void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + assert(objp); + if (cachep->nr_objs > 10 || cachep->align) { + memset(objp, POISON_FREE, cachep->size); + free(objp); + } else { + struct radix_tree_node *node = objp; + cachep->nr_objs++; + node->parent = cachep->objs; + cachep->objs = node; + } +} + +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); + if (kmalloc_verbose) + printf("Freeing %p to slab\n", objp); + __kmem_cache_free_locked(cachep, objp); +} + +void kmem_cache_free(struct kmem_cache *cachep, void *objp) +{ + pthread_mutex_lock(&cachep->lock); + kmem_cache_free_locked(cachep, objp); + pthread_mutex_unlock(&cachep->lock); +} + +void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list) +{ + if (kmalloc_verbose) + pr_debug("Bulk free %p[0-%lu]\n", list, size - 1); + + pthread_mutex_lock(&cachep->lock); + for (int i = 0; i < size; i++) + kmem_cache_free_locked(cachep, list[i]); + pthread_mutex_unlock(&cachep->lock); +} + +void kmem_cache_shrink(struct kmem_cache *cachep) +{ +} + +int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, + void **p) +{ + size_t i; + + if (kmalloc_verbose) + pr_debug("Bulk alloc %lu\n", size); + + pthread_mutex_lock(&cachep->lock); + if (cachep->nr_objs >= size) { + struct radix_tree_node *node; + + for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + + node = cachep->objs; + cachep->nr_objs--; + cachep->objs = node->parent; + p[i] = node; + node->parent = NULL; + } + pthread_mutex_unlock(&cachep->lock); + } else { + pthread_mutex_unlock(&cachep->lock); + for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + + if (cachep->align) { + posix_memalign(&p[i], cachep->align, + cachep->size); + } else { + p[i] = malloc(cachep->size); + if (!p[i]) + break; + } + if (cachep->ctor) + cachep->ctor(p[i]); + else if (gfp & __GFP_ZERO) + memset(p[i], 0, cachep->size); + } + } + + if (i < size) { + size = i; + pthread_mutex_lock(&cachep->lock); + for (i = 0; i < size; i++) + __kmem_cache_free_locked(cachep, p[i]); + pthread_mutex_unlock(&cachep->lock); + return 0; + } + + for (i = 0; i < size; i++) { + uatomic_inc(&nr_allocated); + uatomic_inc(&cachep->nr_allocated); + uatomic_inc(&cachep->nr_tallocated); + if (kmalloc_verbose) + printf("Allocating %p from slab\n", p[i]); + } + + return size; +} + +struct kmem_cache * +kmem_cache_create(const char *name, unsigned int size, unsigned int align, + unsigned 
int flags, void (*ctor)(void *)) +{ + struct kmem_cache *ret = malloc(sizeof(*ret)); + + pthread_mutex_init(&ret->lock, NULL); + ret->size = size; + ret->align = align; + ret->nr_objs = 0; + ret->nr_allocated = 0; + ret->nr_tallocated = 0; + ret->objs = NULL; + ret->ctor = ctor; + ret->non_kernel = 0; + return ret; +} + +/* + * Test the test infrastructure for kem_cache_alloc/free and bulk counterparts. + */ +void test_kmem_cache_bulk(void) +{ + int i; + void *list[12]; + static struct kmem_cache *test_cache, *test_cache2; + + /* + * Testing the bulk allocators without aligned kmem_cache to force the + * bulk alloc/free to reuse + */ + test_cache = kmem_cache_create("test_cache", 256, 0, SLAB_PANIC, NULL); + + for (i = 0; i < 5; i++) + list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); + + for (i = 0; i < 5; i++) + kmem_cache_free(test_cache, list[i]); + assert(test_cache->nr_objs == 5); + + kmem_cache_alloc_bulk(test_cache, __GFP_DIRECT_RECLAIM, 5, list); + kmem_cache_free_bulk(test_cache, 5, list); + + for (i = 0; i < 12 ; i++) + list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); + + for (i = 0; i < 12; i++) + kmem_cache_free(test_cache, list[i]); + + /* The last free will not be kept around */ + assert(test_cache->nr_objs == 11); + + /* Aligned caches will immediately free */ + test_cache2 = kmem_cache_create("test_cache2", 128, 128, SLAB_PANIC, NULL); + + kmem_cache_alloc_bulk(test_cache2, __GFP_DIRECT_RECLAIM, 10, list); + kmem_cache_free_bulk(test_cache2, 10, list); + assert(!test_cache2->nr_objs); + + +} diff --git a/tools/testing/shared/linux/bug.h b/tools/testing/shared/linux/bug.h new file mode 100644 index 000000000000..03dc8a57eb99 --- /dev/null +++ b/tools/testing/shared/linux/bug.h @@ -0,0 +1,2 @@ +#include +#include "asm/bug.h" diff --git a/tools/testing/shared/linux/cpu.h b/tools/testing/shared/linux/cpu.h new file mode 100644 index 000000000000..a45530d78107 --- /dev/null +++ b/tools/testing/shared/linux/cpu.h @@ -0,0 +1 @@ +#define cpuhp_setup_state_nocalls(a, b, c, d) (0) diff --git a/tools/testing/shared/linux/idr.h b/tools/testing/shared/linux/idr.h new file mode 100644 index 000000000000..4e342f2e37cf --- /dev/null +++ b/tools/testing/shared/linux/idr.h @@ -0,0 +1 @@ +#include "../../../../include/linux/idr.h" diff --git a/tools/testing/shared/linux/init.h b/tools/testing/shared/linux/init.h new file mode 100644 index 000000000000..81563c3dfce7 --- /dev/null +++ b/tools/testing/shared/linux/init.h @@ -0,0 +1,2 @@ +#define __init +#define __exit diff --git a/tools/testing/shared/linux/kconfig.h b/tools/testing/shared/linux/kconfig.h new file mode 100644 index 000000000000..6c8675859913 --- /dev/null +++ b/tools/testing/shared/linux/kconfig.h @@ -0,0 +1 @@ +#include "../../../../include/linux/kconfig.h" diff --git a/tools/testing/shared/linux/kernel.h b/tools/testing/shared/linux/kernel.h new file mode 100644 index 000000000000..c0a2bb785b92 --- /dev/null +++ b/tools/testing/shared/linux/kernel.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_H +#define _KERNEL_H + +#include "../../include/linux/kernel.h" +#include +#include +#include + +#include +#include +#include +#include +#include "../../../include/linux/kconfig.h" + +#define printk printf +#define pr_err printk +#define pr_info printk +#define pr_debug printk +#define pr_cont printk +#define schedule() +#define PAGE_SHIFT 12 + +#define __acquires(x) +#define __releases(x) +#define __must_hold(x) + +#define EXPORT_PER_CPU_SYMBOL_GPL(x) +#endif /* _KERNEL_H */ diff 
--git a/tools/testing/shared/linux/kmemleak.h b/tools/testing/shared/linux/kmemleak.h new file mode 100644 index 000000000000..155f112786c4 --- /dev/null +++ b/tools/testing/shared/linux/kmemleak.h @@ -0,0 +1 @@ +static inline void kmemleak_update_trace(const void *ptr) { } diff --git a/tools/testing/shared/linux/local_lock.h b/tools/testing/shared/linux/local_lock.h new file mode 100644 index 000000000000..b3cf8b233ca4 --- /dev/null +++ b/tools/testing/shared/linux/local_lock.h @@ -0,0 +1,8 @@ +#ifndef _LINUX_LOCAL_LOCK +#define _LINUX_LOCAL_LOCK +typedef struct { } local_lock_t; + +static inline void local_lock(local_lock_t *lock) { } +static inline void local_unlock(local_lock_t *lock) { } +#define INIT_LOCAL_LOCK(x) { } +#endif diff --git a/tools/testing/shared/linux/lockdep.h b/tools/testing/shared/linux/lockdep.h new file mode 100644 index 000000000000..62473ab57f99 --- /dev/null +++ b/tools/testing/shared/linux/lockdep.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_LOCKDEP_H +#define _LINUX_LOCKDEP_H + +#include + +struct lock_class_key { + unsigned int a; +}; + +static inline void lockdep_set_class(spinlock_t *lock, + struct lock_class_key *key) +{ +} + +extern int lockdep_is_held(const void *); +#endif /* _LINUX_LOCKDEP_H */ diff --git a/tools/testing/shared/linux/maple_tree.h b/tools/testing/shared/linux/maple_tree.h new file mode 100644 index 000000000000..06c89bdcc515 --- /dev/null +++ b/tools/testing/shared/linux/maple_tree.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#define atomic_t int32_t +#define atomic_inc(x) uatomic_inc(x) +#define atomic_read(x) uatomic_read(x) +#define atomic_set(x, y) do {} while (0) +#define U8_MAX UCHAR_MAX +#include "../../../../include/linux/maple_tree.h" diff --git a/tools/testing/shared/linux/percpu.h b/tools/testing/shared/linux/percpu.h new file mode 100644 index 000000000000..b2403aa743b2 --- /dev/null +++ b/tools/testing/shared/linux/percpu.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define DECLARE_PER_CPU(type, val) extern type val +#define DEFINE_PER_CPU(type, val) type val + +#define __get_cpu_var(var) var +#define this_cpu_ptr(var) var +#define this_cpu_read(var) var +#define this_cpu_xchg(var, val) uatomic_xchg(&var, val) +#define this_cpu_cmpxchg(var, old, new) uatomic_cmpxchg(&var, old, new) +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) diff --git a/tools/testing/shared/linux/preempt.h b/tools/testing/shared/linux/preempt.h new file mode 100644 index 000000000000..edb10302b903 --- /dev/null +++ b/tools/testing/shared/linux/preempt.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_PREEMPT_H +#define __LINUX_PREEMPT_H + +extern int preempt_count; + +#define preempt_disable() uatomic_inc(&preempt_count) +#define preempt_enable() uatomic_dec(&preempt_count) + +static inline int in_interrupt(void) +{ + return 0; +} + +#endif /* __LINUX_PREEMPT_H */ diff --git a/tools/testing/shared/linux/radix-tree.h b/tools/testing/shared/linux/radix-tree.h new file mode 100644 index 000000000000..d1635a5bef02 --- /dev/null +++ b/tools/testing/shared/linux/radix-tree.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TEST_RADIX_TREE_H +#define _TEST_RADIX_TREE_H + +#include "../../../../include/linux/radix-tree.h" + +extern int kmalloc_verbose; +extern int test_verbose; + +static inline void trace_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + if (kmalloc_verbose) + printf("Delaying free of %p to 
slab\n", (char *)head - + offsetof(struct radix_tree_node, rcu_head)); + call_rcu(head, func); +} + +#define printv(verbosity_level, fmt, ...) \ + if(test_verbose >= verbosity_level) \ + printf(fmt, ##__VA_ARGS__) + +#undef call_rcu +#define call_rcu(x, y) trace_call_rcu(x, y) + +#endif /* _TEST_RADIX_TREE_H */ diff --git a/tools/testing/shared/linux/rcupdate.h b/tools/testing/shared/linux/rcupdate.h new file mode 100644 index 000000000000..fed468fb0c78 --- /dev/null +++ b/tools/testing/shared/linux/rcupdate.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RCUPDATE_H +#define _RCUPDATE_H + +#include + +#define rcu_dereference_raw(p) rcu_dereference(p) +#define rcu_dereference_protected(p, cond) rcu_dereference(p) +#define rcu_dereference_check(p, cond) rcu_dereference(p) +#define RCU_INIT_POINTER(p, v) do { (p) = (v); } while (0) + +#endif diff --git a/tools/testing/shared/linux/xarray.h b/tools/testing/shared/linux/xarray.h new file mode 100644 index 000000000000..df3812cda376 --- /dev/null +++ b/tools/testing/shared/linux/xarray.h @@ -0,0 +1,2 @@ +#include "generated/map-shift.h" +#include "../../../../include/linux/xarray.h" diff --git a/tools/testing/shared/maple-shared.h b/tools/testing/shared/maple-shared.h new file mode 100644 index 000000000000..3d847edd149d --- /dev/null +++ b/tools/testing/shared/maple-shared.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#define CONFIG_DEBUG_MAPLE_TREE +#define CONFIG_MAPLE_SEARCH +#define MAPLE_32BIT (MAPLE_NODE_SLOTS > 31) +#include "shared.h" +#include +#include +#include "linux/init.h" diff --git a/tools/testing/shared/maple-shim.c b/tools/testing/shared/maple-shim.c new file mode 100644 index 000000000000..640df76f483e --- /dev/null +++ b/tools/testing/shared/maple-shim.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* Very simple shim around the maple tree. */ + +#include "maple-shared.h" + +#include "../../../lib/maple_tree.c" diff --git a/tools/testing/shared/shared.h b/tools/testing/shared/shared.h new file mode 100644 index 000000000000..f08f683812ad --- /dev/null +++ b/tools/testing/shared/shared.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#include +#include + +#ifndef module_init +#define module_init(x) +#endif + +#ifndef module_exit +#define module_exit(x) +#endif + +#ifndef MODULE_AUTHOR +#define MODULE_AUTHOR(x) +#endif + +#ifndef MODULE_LICENSE +#define MODULE_LICENSE(x) +#endif + +#ifndef MODULE_DESCRIPTION +#define MODULE_DESCRIPTION(x) +#endif + +#ifndef dump_stack +#define dump_stack() assert(0) +#endif diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk new file mode 100644 index 000000000000..a05f0588513a --- /dev/null +++ b/tools/testing/shared/shared.mk @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += -I../shared -I. 
-I../../include -I../../../lib -g -Og -Wall \ + -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined +LDFLAGS += -fsanitize=address -fsanitize=undefined +LDLIBS += -lpthread -lurcu +LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o +SHARED_OFILES = xarray-shared.o radix-tree.o idr.o linux.o $(LIBS) + +SHARED_DEPS = Makefile ../shared/shared.mk ../shared/*.h generated/map-shift.h \ + generated/bit-length.h generated/autoconf.h \ + ../../include/linux/*.h \ + ../../include/asm/*.h \ + ../../../include/linux/xarray.h \ + ../../../include/linux/maple_tree.h \ + ../../../include/linux/radix-tree.h \ + ../../../lib/radix-tree.h \ + ../../../include/linux/idr.h + +ifndef SHIFT + SHIFT=3 +endif + +ifeq ($(BUILD), 32) + CFLAGS += -m32 + LDFLAGS += -m32 +LONG_BIT := 32 +endif + +ifndef LONG_BIT +LONG_BIT := $(shell getconf LONG_BIT) +endif + +%.o: ../shared/%.c + $(CC) -c $(CFLAGS) $< -o $@ + +vpath %.c ../../lib + +$(SHARED_OFILES): $(SHARED_DEPS) + +radix-tree.c: ../../../lib/radix-tree.c + sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ + +idr.c: ../../../lib/idr.c + sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ + +xarray-shared.o: ../shared/xarray-shared.c ../../../lib/xarray.c \ + ../../../lib/test_xarray.c + +maple-shared.o: ../shared/maple-shared.c ../../../lib/maple_tree.c \ + ../../../lib/test_maple_tree.c + +generated/autoconf.h: + @mkdir -p generated + cp ../shared/autoconf.h generated/autoconf.h + +generated/map-shift.h: + @mkdir -p generated + @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ + echo "Generating $@"; \ + echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ + generated/map-shift.h; \ + fi + +generated/bit-length.h: FORCE + @mkdir -p generated + @if ! grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \ + echo "Generating $@"; \ + echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \ + fi + +FORCE: ; diff --git a/tools/testing/shared/trace/events/maple_tree.h b/tools/testing/shared/trace/events/maple_tree.h new file mode 100644 index 000000000000..97d0e1ddcf08 --- /dev/null +++ b/tools/testing/shared/trace/events/maple_tree.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#define trace_ma_op(a, b) do {} while (0) +#define trace_ma_read(a, b) do {} while (0) +#define trace_ma_write(a, b, c, d) do {} while (0) diff --git a/tools/testing/shared/xarray-shared.c b/tools/testing/shared/xarray-shared.c new file mode 100644 index 000000000000..e90901958dcd --- /dev/null +++ b/tools/testing/shared/xarray-shared.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "xarray-shared.h" + +#include "../../../lib/xarray.c" diff --git a/tools/testing/shared/xarray-shared.h b/tools/testing/shared/xarray-shared.h new file mode 100644 index 000000000000..ac2d16ff53ae --- /dev/null +++ b/tools/testing/shared/xarray-shared.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#define XA_DEBUG +#include "shared.h" -- cgit v1.2.3 From 9325b8b5a1cba1fbabe6e8217e248c5f90fd0e13 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 29 Jul 2024 12:50:41 +0100 Subject: tools: add skeleton code for userland testing of VMA logic Establish a new userland VMA unit testing implementation under tools/testing which utilises the now-shared maple tree support code, previously exclusive to radix tree testing, to provide the tree functionality in userland. This provides fundamental VMA operations whose API is defined in mm/vma.h, while stubbing out superfluous functionality.
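(Illustrative sketch, not part of this patch: the stubbing approach described above replaces kernel-only facilities with userland no-ops so that mm/vma.c compiles unmodified. A hypothetical stub in the style used by vma_internal.h below:)

static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	/* Userland has no TLB to flush; the harness stubs this out. */
	(void)tlb;
	(void)mm;
}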
This exists as a proof-of-concept, with the test implementation functional and sufficient to allow userland compilation of vma.c, but containing only cursory tests to demonstrate basic functionality. Link: https://lkml.kernel.org/r/533ffa2eec771cbe6b387dd049a7f128a53eb616.1722251717.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: SeongJae Park Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Brendan Higgins Cc: Christian Brauner Cc: David Gow Cc: Eric W. Biederman Cc: Jan Kara Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Rae Moar Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Pengfei Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + tools/testing/vma/.gitignore | 7 + tools/testing/vma/Makefile | 16 + tools/testing/vma/linux/atomic.h | 12 + tools/testing/vma/linux/mmzone.h | 38 ++ tools/testing/vma/vma.c | 207 +++++++++ tools/testing/vma/vma_internal.h | 882 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 1163 insertions(+) create mode 100644 tools/testing/vma/.gitignore create mode 100644 tools/testing/vma/Makefile create mode 100644 tools/testing/vma/linux/atomic.h create mode 100644 tools/testing/vma/linux/mmzone.h create mode 100644 tools/testing/vma/vma.c create mode 100644 tools/testing/vma/vma_internal.h (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 4a7b6e25bc77..41b15083e4a1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24423,6 +24423,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: mm/vma.c F: mm/vma.h F: mm/vma_internal.h +F: tools/testing/vma/ VMALLOC M: Andrew Morton diff --git a/tools/testing/vma/.gitignore b/tools/testing/vma/.gitignore new file mode 100644 index 000000000000..b003258eba79 --- /dev/null +++ b/tools/testing/vma/.gitignore @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +generated/bit-length.h +generated/map-shift.h +generated/autoconf.h +idr.c +radix-tree.c +vma diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile new file mode 100644 index 000000000000..bfc905d222cf --- /dev/null +++ b/tools/testing/vma/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +.PHONY: default + +default: vma + +include ../shared/shared.mk + +OFILES = $(SHARED_OFILES) vma.o maple-shim.o +TARGETS = vma + +vma: $(OFILES) vma_internal.h ../../../mm/vma.c ../../../mm/vma.h + $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) + +clean: + $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h generated/bit-length.h generated/autoconf.h diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h new file mode 100644 index 000000000000..e01f66f98982 --- /dev/null +++ b/tools/testing/vma/linux/atomic.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_ATOMIC_H +#define _LINUX_ATOMIC_H + +#define atomic_t int32_t +#define atomic_inc(x) uatomic_inc(x) +#define atomic_read(x) uatomic_read(x) +#define atomic_set(x, y) do {} while (0) +#define U8_MAX UCHAR_MAX + +#endif /* _LINUX_ATOMIC_H */ diff --git a/tools/testing/vma/linux/mmzone.h b/tools/testing/vma/linux/mmzone.h new file mode 100644 index 000000000000..33cd1517f7a3 --- /dev/null +++ b/tools/testing/vma/linux/mmzone.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_MMZONE_H +#define _LINUX_MMZONE_H + +#include + +struct pglist_data *first_online_pgdat(void); +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); + +#define for_each_online_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); \ 
+ pgdat; \ + pgdat = next_online_pgdat(pgdat)) + +enum zone_type { + __MAX_NR_ZONES +}; + +#define MAX_NR_ZONES __MAX_NR_ZONES +#define MAX_PAGE_ORDER 10 +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) + +#define pageblock_order MAX_PAGE_ORDER +#define pageblock_nr_pages BIT(pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) + +struct zone { + atomic_long_t managed_pages; +}; + +typedef struct pglist_data { + struct zone node_zones[MAX_NR_ZONES]; + +} pg_data_t; + +#endif /* _LINUX_MMZONE_H */ diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c new file mode 100644 index 000000000000..48e033c60d87 --- /dev/null +++ b/tools/testing/vma/vma.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include "maple-shared.h" +#include "vma_internal.h" + +/* + * Directly import the VMA implementation here. Our vma_internal.h wrapper + * provides userland-equivalent functionality for everything vma.c uses. + */ +#include "../../../mm/vma.c" + +const struct vm_operations_struct vma_dummy_vm_ops; + +#define ASSERT_TRUE(_expr) \ + do { \ + if (!(_expr)) { \ + fprintf(stderr, \ + "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \ + __FILE__, __LINE__, __FUNCTION__, #_expr); \ + return false; \ + } \ + } while (0) +#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr)) +#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) +#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) + +static struct vm_area_struct *alloc_vma(struct mm_struct *mm, + unsigned long start, + unsigned long end, + pgoff_t pgoff, + vm_flags_t flags) +{ + struct vm_area_struct *ret = vm_area_alloc(mm); + + if (ret == NULL) + return NULL; + + ret->vm_start = start; + ret->vm_end = end; + ret->vm_pgoff = pgoff; + ret->__vm_flags = flags; + + return ret; +} + +static bool test_simple_merge(void) +{ + struct vm_area_struct *vma; + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, flags); + struct vm_area_struct *vma_middle = alloc_vma(&mm, 0x1000, 0x2000, 1, flags); + struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, flags); + VMA_ITERATOR(vmi, &mm, 0x1000); + + ASSERT_FALSE(vma_link(&mm, vma_left)); + ASSERT_FALSE(vma_link(&mm, vma_middle)); + ASSERT_FALSE(vma_link(&mm, vma_right)); + + vma = vma_merge_new_vma(&vmi, vma_left, vma_middle, 0x1000, + 0x2000, 1); + ASSERT_NE(vma, NULL); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->vm_flags, flags); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_modify(void) +{ + struct vm_area_struct *vma; + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0x1000); + + ASSERT_FALSE(vma_link(&mm, init_vma)); + + /* + * The flags will not be changed, the vma_modify_flags() function + * performs the merge/split only. + */ + vma = vma_modify_flags(&vmi, init_vma, init_vma, + 0x1000, 0x2000, VM_READ | VM_MAYREAD); + ASSERT_NE(vma, NULL); + /* We modify the provided VMA, and on split allocate new VMAs. 
*/ + ASSERT_EQ(vma, init_vma); + + ASSERT_EQ(vma->vm_start, 0x1000); + ASSERT_EQ(vma->vm_end, 0x2000); + ASSERT_EQ(vma->vm_pgoff, 1); + + /* + * Now walk through the three split VMAs and make sure they are as + * expected. + */ + + vma_iter_set(&vmi, 0); + vma = vma_iter_load(&vmi); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x1000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + vma_iter_clear(&vmi); + + vma = vma_next(&vmi); + + ASSERT_EQ(vma->vm_start, 0x1000); + ASSERT_EQ(vma->vm_end, 0x2000); + ASSERT_EQ(vma->vm_pgoff, 1); + + vm_area_free(vma); + vma_iter_clear(&vmi); + + vma = vma_next(&vmi); + + ASSERT_EQ(vma->vm_start, 0x2000); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 2); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_expand(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0); + + ASSERT_FALSE(vma_link(&mm, vma)); + + ASSERT_FALSE(vma_expand(&vmi, vma, 0, 0x3000, 0, NULL)); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_shrink(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0); + + ASSERT_FALSE(vma_link(&mm, vma)); + + ASSERT_FALSE(vma_shrink(&vmi, vma, 0, 0x1000, 0)); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x1000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +int main(void) +{ + int num_tests = 0, num_fail = 0; + + maple_tree_init(); + +#define TEST(name) \ + do { \ + num_tests++; \ + if (!test_##name()) { \ + num_fail++; \ + fprintf(stderr, "Test " #name " FAILED\n"); \ + } \ + } while (0) + + TEST(simple_merge); + TEST(simple_modify); + TEST(simple_expand); + TEST(simple_shrink); + +#undef TEST + + printf("%d tests run, %d passed, %d failed.\n", + num_tests, num_tests - num_fail, num_fail); + + return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h new file mode 100644 index 000000000000..093560e5b2ac --- /dev/null +++ b/tools/testing/vma/vma_internal.h @@ -0,0 +1,882 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * vma_internal.h + * + * Header providing userland wrappers and shims for the functionality provided + * by mm/vma_internal.h. + * + * We make the header guard the same as mm/vma_internal.h, so if this shim + * header is included, it precludes the inclusion of the kernel one. 
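+ *
+ * (Illustration: both headers use the __MM_VMA_INTERNAL_H guard defined just
+ * below, so once this shim is included first, any later #include of the
+ * kernel's mm/vma_internal.h compiles to nothing and vma.c picks up the
+ * userland definitions in this file instead.)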
+ */ + +#ifndef __MM_VMA_INTERNAL_H +#define __MM_VMA_INTERNAL_H + +#define __private +#define __bitwise +#define __randomize_layout + +#define CONFIG_MMU +#define CONFIG_PER_VMA_LOCK + +#include + +#include +#include +#include +#include +#include + +#define VM_WARN_ON(_expr) (WARN_ON(_expr)) +#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) +#define VM_BUG_ON(_expr) (BUG_ON(_expr)) +#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) + +#define VM_NONE 0x00000000 +#define VM_READ 0x00000001 +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 +#define VM_MAYREAD 0x00000010 +#define VM_MAYWRITE 0x00000020 +#define VM_GROWSDOWN 0x00000100 +#define VM_PFNMAP 0x00000400 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 +#define VM_DONTEXPAND 0x00040000 +#define VM_ACCOUNT 0x00100000 +#define VM_MIXEDMAP 0x10000000 +#define VM_STACK VM_GROWSDOWN +#define VM_SHADOW_STACK VM_NONE +#define VM_SOFTDIRTY 0 + +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define FIRST_USER_ADDRESS 0UL +#define USER_PGTABLES_CEILING 0UL + +#define vma_policy(vma) NULL + +#define down_write_nest_lock(sem, nest_lock) + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) + +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) + +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) + +#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr) +#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr) + +#define TASK_SIZE ((1ul << 47)-PAGE_SIZE) + +#define AS_MM_ALL_LOCKS 2 + +#define current NULL + +/* We hardcode this for now. */ +#define sysctl_max_map_count 0x1000000UL + +#define pgoff_t unsigned long +typedef unsigned long pgprotval_t; +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; +typedef unsigned long vm_flags_t; +typedef __bitwise unsigned int vm_fault_t; + +typedef struct refcount_struct { + atomic_t refs; +} refcount_t; + +struct kref { + refcount_t refcount; +}; + +struct anon_vma { + struct anon_vma *root; + struct rb_root_cached rb_root; +}; + +struct anon_vma_chain { + struct anon_vma *anon_vma; + struct list_head same_vma; +}; + +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. 
*/ + char name[]; +}; + +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, __mm, __addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &(__mm)->mm_mt, \ + .index = __addr, \ + .node = NULL, \ + .status = ma_start, \ + }, \ + } + +struct address_space { + struct rb_root_cached i_mmap; + unsigned long flags; + atomic_t i_mmap_writable; +}; + +struct vm_userfaultfd_ctx {}; +struct mempolicy {}; +struct mmu_gather {}; +struct mutex {}; +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = {} + +struct mm_struct { + struct maple_tree mm_mt; + int map_count; /* number of VMAs */ + unsigned long total_vm; /* Total pages mapped */ + unsigned long locked_vm; /* Pages that have PG_mlocked set */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ +}; + +struct vma_lock { + struct rw_semaphore lock; +}; + + +struct file { + struct address_space *f_mapping; +}; + +struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + + union { + struct { + /* VMA covers [vm_start; vm_end) addresses within mm */ + unsigned long vm_start; + unsigned long vm_end; + }; +#ifdef CONFIG_PER_VMA_LOCK + struct rcu_head vm_rcu; /* Used for deferred freeing. */ +#endif + }; + + struct mm_struct *vm_mm; /* The address space we belong to. */ + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vm_flags_t __private __vm_flags; + }; + +#ifdef CONFIG_PER_VMA_LOCK + /* Flag to indicate areas detached from the mm->mm_mt tree */ + bool detached; + + /* + * Can only be written (using WRITE_ONCE()) while holding both: + * - mmap_lock (in write mode) + * - vm_lock->lock (in write mode) + * Can be read reliably while holding one of: + * - mmap_lock (in read or write mode) + * - vm_lock->lock (in read or write mode) + * Can be read unreliably (using READ_ONCE()) for pessimistic bailout + * while holding nothing (except RCU to keep the VMA struct allocated). + * + * This sequence counter is explicitly allowed to overflow; sequence + * counter reuse can only lead to occasional unnecessary use of the + * slowpath. + */ + int vm_lock_seq; + struct vma_lock *vm_lock; +#endif + + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + + /* + * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma + * list, after a COW of one of the file pages. A MAP_SHARED vma + * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack + * or brk vma (with NULL file) can only be in an anon_vma list. + */ + struct list_head anon_vma_chain; /* Serialized by mmap_lock & + * page_table_lock */ + struct anon_vma *anon_vma; /* Serialized by page_table_lock */ + + /* Function pointers to deal with this struct. */ + const struct vm_operations_struct *vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units */ + struct file * vm_file; /* File we map to (can be NULL). 
*/ + void * vm_private_data; /* was vm_pte (shared mem) */ + +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif +#ifdef CONFIG_SWAP + atomic_long_t swap_readahead_info; +#endif +#ifndef CONFIG_MMU + struct vm_region *vm_region; /* NOMMU mapping region */ +#endif +#ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif +#ifdef CONFIG_NUMA_BALANCING + struct vma_numab_state *numab_state; /* NUMA Balancing state */ +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +} __randomize_layout; + +struct vm_fault {}; + +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*close)(struct vm_area_struct * area); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); + int (*mremap)(struct vm_area_struct *area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if mprotect() can proceed. + */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); + vm_fault_t (*fault)(struct vm_fault *vmf); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff); + unsigned long (*pagesize)(struct vm_area_struct * area); + + /* notification that a previously read-only page is about to become + * writable, if an error is returned it will cause a SIGBUS */ + vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); + + /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ + vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); + + /* called by access_process_vm when get_user_pages() fails, typically + * for use by special VMAs. See also generic_access_phys() for a generic + * implementation useful for any iomem mapping. + */ + int (*access)(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + + /* Called by the /proc/PID/maps code to ask the vma whether it + * has a special name. Returning non-NULL will also cause this + * vma to be dumped unconditionally. */ + const char *(*name)(struct vm_area_struct *vma); + +#ifdef CONFIG_NUMA + /* + * set_policy() op must add a reference to any non-NULL @new mempolicy + * to hold the policy upon return. Caller should pass NULL @new to + * remove a policy and fall back to surrounding context--i.e. do not + * install a MPOL_DEFAULT policy, nor the task or system default + * mempolicy. + */ + int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); + + /* + * get_policy() op must add reference [mpol_get()] to any policy at + * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure + * in mm/mempolicy.c will do this automatically. + * get_policy() must NOT add a ref if the policy at (vma,addr) is not + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. + * If no [shared/vma] mempolicy exists at the addr, get_policy() op + * must return NULL--i.e., do not "fallback" to task or system default + * policy. 
+ */ + struct mempolicy *(*get_policy)(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *ilx); +#endif + /* + * Called by vm_normal_page() for special PTEs to find the + * page for @addr. This is useful if the default behavior + * (using pte_page()) would not find the correct page. + */ + struct page *(*find_special_page)(struct vm_area_struct *vma, + unsigned long addr); +}; + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot)); +} + +static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + return __pgprot(vm_flags); +} + +static inline bool is_shared_maywrite(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(vma->vm_flags); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses mas_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. + */ + return mas_find(&vmi->mas, ULONG_MAX); +} + +static inline bool vma_lock_alloc(struct vm_area_struct *vma) +{ + vma->vm_lock = calloc(1, sizeof(struct vma_lock)); + + if (!vma->vm_lock) + return false; + + init_rwsem(&vma->vm_lock->lock); + vma->vm_lock_seq = -1; + + return true; +} + +static inline void vma_assert_write_locked(struct vm_area_struct *); +static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +{ + /* When detaching vma should be write-locked */ + if (detached) + vma_assert_write_locked(vma); + vma->detached = detached; +} + +extern const struct vm_operations_struct vma_dummy_vm_ops; + +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +{ + memset(vma, 0, sizeof(*vma)); + vma->vm_mm = mm; + vma->vm_ops = &vma_dummy_vm_ops; + INIT_LIST_HEAD(&vma->anon_vma_chain); + vma_mark_detached(vma, false); +} + +static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct)); + + if (!vma) + return NULL; + + vma_init(vma, mm); + if (!vma_lock_alloc(vma)) { + free(vma); + return NULL; + } + + return vma; +} + +static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct)); + + if (!new) + return NULL; + + memcpy(new, orig, sizeof(*new)); + if (!vma_lock_alloc(new)) { + free(new); + return NULL; + } + INIT_LIST_HEAD(&new->anon_vma_chain); + + return new; +} + +/* + * These are defined in vma.h, but sadly vm_stat_account() is referenced by + * kernel/fork.c, so we have to make these broadly available there, and + * temporarily define them here to resolve the dependency cycle.
+ */ + +#define is_exec_mapping(flags) \ + ((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC) + +#define is_stack_mapping(flags) \ + (((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK)) + +#define is_data_mapping(flags) \ + ((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE) + +static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, + long npages) +{ + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); + + if (is_exec_mapping(flags)) + mm->exec_vm += npages; + else if (is_stack_mapping(flags)) + mm->stack_vm += npages; + else if (is_data_mapping(flags)) + mm->data_vm += npages; +} + +#undef is_exec_mapping +#undef is_stack_mapping +#undef is_data_mapping + +/* Currently stubbed but we may later wish to un-stub. */ +static inline void vm_acct_memory(long pages); +static inline void vm_unacct_memory(long pages) +{ + vm_acct_memory(-pages); +} + +static inline void mapping_allow_writable(struct address_space *mapping) +{ + atomic_inc(&mapping->i_mmap_writable); +} + +static inline void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} + +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max - 1); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void mmap_assert_locked(struct mm_struct *); +static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} + +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + return mtree_load(&mm->mm_mt, addr); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + +/* Defined in vma.h, so temporarily define here to avoid circular dependency. */ +#define vma_iter_load(vmi) \ + mas_walk(&(vmi)->mas) + +static inline struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); + + vma = vma_iter_load(&vmi); + *pprev = vma_prev(&vmi); + if (!vma) + vma = vma_next(&vmi); + return vma; +} + +#undef vma_iter_load + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + mas_init(&vmi->mas, &mm->mm_mt, addr); +} + +/* Stubbed functions. 
*/ + +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx vm_ctx) +{ + return true; +} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + return true; +} + +static inline void might_sleep(void) +{ +} + +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + +static inline void fput(struct file *) +{ +} + +static inline void mpol_put(struct mempolicy *) +{ +} + +static inline void vma_lock_free(struct vm_area_struct *vma) +{ + free(vma->vm_lock); +} + +static inline void __vm_area_free(struct vm_area_struct *vma) +{ + vma_lock_free(vma); + free(vma); +} + +static inline void vm_area_free(struct vm_area_struct *vma) +{ + __vm_area_free(vma); +} + +static inline void lru_add_drain(void) +{ +} + +static inline void tlb_gather_mmu(struct mmu_gather *, struct mm_struct *) +{ +} + +static inline void update_hiwater_rss(struct mm_struct *) +{ +} + +static inline void update_hiwater_vm(struct mm_struct *) +{ +} + +static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long tree_end, + bool mm_wr_locked) +{ + (void)tlb; + (void)mas; + (void)vma; + (void)start_addr; + (void)end_addr; + (void)tree_end; + (void)mm_wr_locked; +} + +static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling, bool mm_wr_locked) +{ + (void)tlb; + (void)mas; + (void)vma; + (void)floor; + (void)ceiling; + (void)mm_wr_locked; +} + +static inline void mapping_unmap_writable(struct address_space *) +{ +} + +static inline void flush_dcache_mmap_lock(struct address_space *) +{ +} + +static inline void tlb_finish_mmu(struct mmu_gather *) +{ +} + +static inline void get_file(struct file *) +{ +} + +static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *) +{ + return 0; +} + +static inline int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *) +{ + return 0; +} + +static inline void vma_start_write(struct vm_area_struct *) +{ +} + +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + (void)vma; + (void)start; + (void)end; + (void)adjust_next; +} + +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline void vm_acct_memory(long pages) +{ +} + +static inline void vma_interval_tree_insert(struct vm_area_struct *, + struct rb_root_cached *) +{ +} + +static inline void vma_interval_tree_remove(struct vm_area_struct *, + struct rb_root_cached *) +{ +} + +static inline void flush_dcache_mmap_unlock(struct address_space *) +{ +} + +static inline void anon_vma_interval_tree_insert(struct anon_vma_chain*, + struct rb_root_cached *) +{ +} + +static inline void anon_vma_interval_tree_remove(struct anon_vma_chain*, + struct rb_root_cached *) +{ +} + +static inline void uprobe_mmap(struct vm_area_struct *) +{ +} + +static inline void uprobe_munmap(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + (void)vma; + (void)start; + (void)end; +} + +static inline void i_mmap_lock_write(struct address_space *) +{ +} + +static inline void anon_vma_lock_write(struct 
anon_vma *) +{ +} + +static inline void vma_assert_write_locked(struct vm_area_struct *) +{ +} + +static inline void unlink_anon_vmas(struct vm_area_struct *) +{ +} + +static inline void anon_vma_unlock_write(struct anon_vma *) +{ +} + +static inline void i_mmap_unlock_write(struct address_space *) +{ +} + +static inline void anon_vma_merge(struct vm_area_struct *, + struct vm_area_struct *) +{ +} + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct list_head *unmaps) +{ + (void)vma; + (void)start; + (void)end; + (void)unmaps; + + return 0; +} + +static inline void mmap_write_downgrade(struct mm_struct *) +{ +} + +static inline void mmap_read_unlock(struct mm_struct *) +{ +} + +static inline void mmap_write_unlock(struct mm_struct *) +{ +} + +static inline bool can_modify_mm(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + (void)mm; + (void)start; + (void)end; + + return true; +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + (void)mm; + (void)start; + (void)end; +} + +static inline void mmap_assert_locked(struct mm_struct *) +{ +} + +static inline bool mpol_equal(struct mempolicy *, struct mempolicy *) +{ + return true; +} + +static inline void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + (void)vma; + (void)vm_flags; +} + +static inline bool mapping_can_writeback(struct address_space *) +{ + return true; +} + +static inline bool is_vm_hugetlb_page(struct vm_area_struct *) +{ + return false; +} + +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *) +{ + return false; +} + +static inline bool userfaultfd_wp(struct vm_area_struct *) +{ + return false; +} + +static inline void mmap_assert_write_locked(struct mm_struct *) +{ +} + +static inline void mutex_lock(struct mutex *) +{ +} + +static inline void mutex_unlock(struct mutex *) +{ +} + +static inline bool mutex_is_locked(struct mutex *) +{ + return true; +} + +static inline bool signal_pending(void *) +{ + return false; +} + +#endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From 67203f3f2a63d429272f0c80451e5fcc469fdb46 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 7 Aug 2024 18:33:36 +0100 Subject: selftests/mm: add mseal test for no-discard madvise Add an mseal test for madvise() operations that aren't considered "discard" (e.g purely advisory ops such as MADV_RANDOM). [pedro.falcato@gmail.com: adjust the mseal test's plan] Link: https://lkml.kernel.org/r/20240807203724.2686144-1-pedro.falcato@gmail.com Link: https://lkml.kernel.org/r/20240807173336.2523757-3-pedro.falcato@gmail.com Signed-off-by: Pedro Falcato Tested-by: Jeff Xu Reviewed-by: Jeff Xu Cc: Kees Cook Cc: Liam R. 
Howlett Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 36 ++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index a818f010de47..7eec3f0152e3 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -1731,6 +1731,38 @@ static void test_seal_discard_ro_anon(bool seal) REPORT_TEST_PASS(); } +static void test_seal_madvise_nodiscard(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * Test a random madvise flag like MADV_RANDOM that does not touch page + * contents (and thus should work for msealed VMAs). RANDOM also happens to + * share bits with other discard-ish flags like REMOVE. + */ + ret = sys_madvise(ptr, size, MADV_RANDOM); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + REPORT_TEST_PASS(); +} + int main(int argc, char **argv) { bool test_seal = seal_support(); @@ -1743,7 +1775,7 @@ int main(int argc, char **argv) if (!pkey_supported()) ksft_print_msg("PKEY not supported\n"); - ksft_set_plan(80); + ksft_set_plan(82); test_seal_addseal(); test_seal_unmapped_start(); @@ -1822,6 +1854,8 @@ int main(int argc, char **argv) test_seal_mremap_move_fixed_zero(true); test_seal_mremap_move_dontunmap_anyaddr(false); test_seal_mremap_move_dontunmap_anyaddr(true); + test_seal_madvise_nodiscard(false); + test_seal_madvise_nodiscard(true); test_seal_discard_ro_anon(false); test_seal_discard_ro_anon(true); test_seal_discard_ro_anon_on_rw(false); -- cgit v1.2.3 From 072cd213b75eb01fcf40eff898f8d5c008ce1457 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Wed, 7 Aug 2024 21:23:20 +0000 Subject: selftest mm/mseal: fix test_seal_mremap_move_dontunmap_anyaddr The mremap() syscall accepts the following: mremap(src, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, dst). When the src is sealed, the call will fail with the error code EPERM. Previously, the test used the hard-coded 0xdeaddead as dst, and it would fail on systems with newer glibc installed. This patch removes the test's dependency on glibc for mremap(), fixes the test, and removes the hardcoded address. Link: https://lkml.kernel.org/r/20240807212320.2831848-1-jeffxu@chromium.org Fixes: 4926c7a52de7 ("selftest mm/mseal memory sealing") Signed-off-by: Jeff Xu Reported-by: Pedro Falcato Cc: Dave Hansen Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 57 +++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index 7eec3f0152e3..bd0bfda7aae7 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -110,6 +110,16 @@ static int sys_madvise(void *start, size_t len, int types) return sret; } +static void *sys_mremap(void *addr, size_t old_len, size_t new_len, + unsigned long flags, void *new_addr) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mremap, addr, old_len, new_len, flags, new_addr); + return sret; +} + static int sys_pkey_alloc(unsigned long flags, unsigned long init_val) { int ret = syscall(__NR_pkey_alloc, flags, init_val); @@ -1115,12 +1125,12 @@ static void test_seal_mremap_shrink(bool seal) } /* shrink from 4 pages to 2 pages. */ - ret2 = mremap(ptr, size, 2 * page_size, 0, 0); + ret2 = sys_mremap(ptr, size, 2 * page_size, 0, 0); if (seal) { - FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(ret2 == (void *) MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { - FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + FAIL_TEST_IF_FALSE(ret2 != (void *) MAP_FAILED); } @@ -1147,7 +1157,7 @@ static void test_seal_mremap_expand(bool seal) } /* expand from 2 page to 4 pages. */ - ret2 = mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); + ret2 = sys_mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1180,7 +1190,7 @@ static void test_seal_mremap_move(bool seal) } /* move from ptr to fixed address. 
*/ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1299,7 +1309,7 @@ static void test_seal_mremap_shrink_fixed(bool seal) } /* mremap to move and shrink to fixed address */ - ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); @@ -1330,7 +1340,7 @@ static void test_seal_mremap_expand_fixed(bool seal) } /* mremap to move and expand to fixed address */ - ret2 = mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); @@ -1361,7 +1371,7 @@ static void test_seal_mremap_move_fixed(bool seal) } /* mremap to move to fixed address */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1390,14 +1400,13 @@ static void test_seal_mremap_move_fixed_zero(bool seal) /* * MREMAP_FIXED can move the mapping to zero address */ - ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { FAIL_TEST_IF_FALSE(ret2 == 0); - } REPORT_TEST_PASS(); @@ -1420,13 +1429,13 @@ static void test_seal_mremap_move_dontunmap(bool seal) } /* mremap to move, and don't unmap src addr. */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { + /* kernel will allocate a new address */ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); - } REPORT_TEST_PASS(); @@ -1434,7 +1443,7 @@ static void test_seal_mremap_move_dontunmap(bool seal) static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) { - void *ptr; + void *ptr, *ptr2; unsigned long page_size = getpagesize(); unsigned long size = 4 * page_size; int ret; @@ -1449,24 +1458,30 @@ static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) } /* - * The 0xdeaddead should not have effect on dest addr - * when MREMAP_DONTUNMAP is set. + * The new address is any address that is not allocated. + * Use allocate/free to simulate that. */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, - 0xdeaddead); + setup_single_address(size, &ptr2); + FAIL_TEST_IF_FALSE(ptr2 != (void *)-1); + ret = sys_munmap(ptr2, size); + FAIL_TEST_IF_FALSE(!ret); + + /* + * remap to any address. + */ + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + (void *) ptr2); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { - FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); - FAIL_TEST_IF_FALSE((long)ret2 != 0xdeaddead); - + /* remap succeeds and returns ptr2 */ + FAIL_TEST_IF_FALSE(ret2 == ptr2); } REPORT_TEST_PASS(); } - static void test_seal_merge_and_split(void) { void *ptr; -- cgit v1.2.3 From 5adfeaecc487e7023f1c7bbdc081707d7a93110f Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Fri, 9 Aug 2024 14:48:51 +0300 Subject: mm: rework accept memory helpers Make accept_memory() and range_contains_unaccepted_memory() take 'start' and 'size' arguments instead of 'start' and 'end'. Remove accept_page(), replacing it with direct calls to accept_memory(). The accept_page() name is going to be used for a different function. Link: https://lkml.kernel.org/r/20240809114854.3745464-6-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Suggested-by: David Hildenbrand Cc: Borislav Petkov Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport (Microsoft) Cc: Tom Lendacky Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/boot/compressed/misc.c | 2 +- arch/x86/boot/compressed/misc.h | 2 +- drivers/firmware/efi/libstub/efistub.h | 2 +- drivers/firmware/efi/libstub/unaccepted_memory.c | 3 ++- drivers/firmware/efi/unaccepted_memory.c | 18 ++++++++++-------- include/linux/mm.h | 12 +++++------- mm/memblock.c | 2 +- mm/mm_init.c | 2 +- mm/page_alloc.c | 19 +++---------------- tools/testing/memblock/internal.h | 2 +- 10 files changed, 26 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 944454306ef4..04a35b2c26e9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -511,7 +511,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) if (init_unaccepted_memory()) { debug_putstr("Accepting memory... "); - accept_memory(__pa(output), __pa(output) + needed_size); + accept_memory(__pa(output), needed_size); } entry_offset = decompress_kernel(output, virt_addr, error); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index b353a7be380c..dd8d1a85f671 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -256,6 +256,6 @@ static inline bool init_unaccepted_memory(void) { return false; } /* Defined in EFI stub */ extern struct efi_unaccepted_memory *unaccepted_table; -void accept_memory(phys_addr_t start, phys_addr_t end); +void accept_memory(phys_addr_t start, unsigned long size); #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index d33ccbc4a2c6..685098f9626f 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -1229,7 +1229,7 @@ efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab); efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, struct efi_boot_memmap *map); void process_unaccepted_memory(u64 start, u64 end); -void accept_memory(phys_addr_t start, phys_addr_t end); +void accept_memory(phys_addr_t start, unsigned long size); void arch_accept_memory(phys_addr_t start, phys_addr_t end); #endif diff --git a/drivers/firmware/efi/libstub/unaccepted_memory.c b/drivers/firmware/efi/libstub/unaccepted_memory.c index c295ea3a6efc..757dbe734a47 100644 --- a/drivers/firmware/efi/libstub/unaccepted_memory.c +++ b/drivers/firmware/efi/libstub/unaccepted_memory.c @@ -177,9 +177,10 @@ void process_unaccepted_memory(u64 start, u64 end) start / unit_size, (end - start) / unit_size); } -void accept_memory(phys_addr_t start, phys_addr_t end) +void accept_memory(phys_addr_t start, unsigned long size) { unsigned long range_start, range_end; + phys_addr_t end = start + size; unsigned long bitmap_size; u64 unit_size; diff --git a/drivers/firmware/efi/unaccepted_memory.c b/drivers/firmware/efi/unaccepted_memory.c 
index 50f6503fe49f..c2c067eff634 100644 --- a/drivers/firmware/efi/unaccepted_memory.c +++ b/drivers/firmware/efi/unaccepted_memory.c @@ -30,11 +30,12 @@ static LIST_HEAD(accepting_list); * - memory that is below phys_base; * - memory that is above the memory that is addressable by the bitmap; */ -void accept_memory(phys_addr_t start, phys_addr_t end) +void accept_memory(phys_addr_t start, unsigned long size) { struct efi_unaccepted_memory *unaccepted; unsigned long range_start, range_end; struct accept_range range, *entry; + phys_addr_t end = start + size; unsigned long flags; u64 unit_size; @@ -74,13 +75,13 @@ void accept_memory(phys_addr_t start, phys_addr_t end) * "guard" page is accepted in addition to the memory that needs to be * used: * - * 1. Implicitly extend the range_contains_unaccepted_memory(start, end) - * checks up to end+unit_size if 'end' is aligned on a unit_size - * boundary. + * 1. Implicitly extend the range_contains_unaccepted_memory(start, size) + * checks up to the next unit_size if 'start+size' is aligned on a + * unit_size boundary. * - * 2. Implicitly extend accept_memory(start, end) to end+unit_size if - * 'end' is aligned on a unit_size boundary. (immediately following - * this comment) + * 2. Implicitly extend accept_memory(start, size) to the next unit_size + * if 'start+size' is aligned on a unit_size boundary. (immediately + * following this comment) */ if (!(end % unit_size)) end += unit_size; @@ -156,9 +157,10 @@ retry: spin_unlock_irqrestore(&unaccepted_memory_lock, flags); } -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end) +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size) { struct efi_unaccepted_memory *unaccepted; + phys_addr_t end = start + size; unsigned long flags; bool ret = false; u64 unit_size; diff --git a/include/linux/mm.h b/include/linux/mm.h index ee8cea73d415..b1eed30fdc06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4062,18 +4062,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, #ifdef CONFIG_UNACCEPTED_MEMORY -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); -void accept_memory(phys_addr_t start, phys_addr_t end); +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); +void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, - phys_addr_t end) + unsigned long size) { return false; } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } @@ -4081,9 +4081,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end) static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { - phys_addr_t paddr = pfn << PAGE_SHIFT; - - return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); + return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); diff --git a/mm/memblock.c b/mm/memblock.c index 3b9dc2d89b8a..0a77a748a8eb 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1500,7 +1500,7 @@ done: * * Accept the memory of the allocated buffer. 
*/ - accept_memory(found, found + size); + accept_memory(found, size); return found; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 51960079875b..f3106b6299d3 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1939,7 +1939,7 @@ static void __init deferred_free_pages(unsigned long pfn, } /* Accept chunks smaller than MAX_PAGE_ORDER upfront */ - accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); + accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE); for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fec7ed5a04f2..6ba88929c1e3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,7 +286,6 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static void accept_page(struct page *page, unsigned int order); static bool cond_accept_memory(struct zone *zone, unsigned int order); static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page); @@ -1268,7 +1267,7 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) return; - accept_page(page, order); + accept_memory(page_to_phys(page), PAGE_SIZE << order); } /* @@ -6932,16 +6931,8 @@ early_param("accept_memory", accept_memory_parse); static bool page_contains_unaccepted(struct page *page, unsigned int order) { phys_addr_t start = page_to_phys(page); - phys_addr_t end = start + (PAGE_SIZE << order); - return range_contains_unaccepted_memory(start, end); -} - -static void accept_page(struct page *page, unsigned int order) -{ - phys_addr_t start = page_to_phys(page); - - accept_memory(start, start + (PAGE_SIZE << order)); + return range_contains_unaccepted_memory(start, PAGE_SIZE << order); } static bool try_to_accept_memory_one(struct zone *zone) @@ -6966,7 +6957,7 @@ static bool try_to_accept_memory_one(struct zone *zone) __ClearPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); - accept_page(page, MAX_PAGE_ORDER); + accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); @@ -7038,10 +7029,6 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static void accept_page(struct page *page, unsigned int order) -{ -} - static bool cond_accept_memory(struct zone *zone, unsigned int order) { return false; diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index f6c6e5474c3a..1cf82acb2a3e 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -20,7 +20,7 @@ void memblock_free_pages(struct page *page, unsigned long pfn, { } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } -- cgit v1.2.3 From 617f8e4d76b81361884db852005f519012ddd244 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 12 Aug 2024 15:05:43 -0400 Subject: maple_tree: add test to replicate low memory race conditions Add new callback fields to the userspace implementation of struct kmem_cache. This allows for executing callback functions in order to further test low memory scenarios where node allocation is retried. This callback can help test race conditions by calling a function when a low memory event is tested. Link: https://lkml.kernel.org/r/20240812190543.71967-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. 
Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 +++++++++ tools/testing/radix-tree/maple.c | 63 ++++++++++++++++++++++++++++++++++++++++ tools/testing/shared/linux.c | 26 ++++++++++++++++- 3 files changed, 101 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b547ff211ac7..14d7864b8d53 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7005,6 +7005,19 @@ void mt_set_non_kernel(unsigned int val) kmem_cache_set_non_kernel(maple_node_cache, val); } +extern void kmem_cache_set_callback(struct kmem_cache *cachep, + void (*callback)(void *)); +void mt_set_callback(void (*callback)(void *)) +{ + kmem_cache_set_callback(maple_node_cache, callback); +} + +extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); +void mt_set_private(void *private) +{ + kmem_cache_set_private(maple_node_cache, private); +} + extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cd1cf05503b4..ef5b83cf94ea 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36224,6 +36224,65 @@ static noinline void __init check_mtree_dup(struct maple_tree *mt) extern void test_kmem_cache_bulk(void); +/* callback function used for check_nomem_writer_race() */ +static void writer2(void *maple_tree) +{ + struct maple_tree *mt = (struct maple_tree *)maple_tree; + MA_STATE(mas, mt, 6, 10); + + mtree_lock(mas.tree); + mas_store(&mas, xa_mk_value(0xC)); + mas_destroy(&mas); + mtree_unlock(mas.tree); +} + +/* + * check_nomem_writer_race() - test a possible race in the mas_nomem() path + * @mt: The tree to build. + * + * There is a possible race condition in low memory conditions when mas_nomem() + * gives up its lock. A second writer can change the entry that the primary + * writer executing the mas_nomem() path is modifying. This test recreates this + * scenario to ensure we are handling it correctly. 
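+ *
+ * (Illustrative timeline: writer 1 begins a store, runs out of nodes, and
+ * mas_nomem() drops the tree lock to allocate; the registered callback then
+ * runs writer2() above, which takes the lock and stores at index 6-10; when
+ * writer 1 resumes, it must re-walk the tree instead of clobbering writer
+ * 2's entry.)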
+ */ +static void check_nomem_writer_race(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 5); + + mt_set_non_kernel(0); + /* setup root with 2 values with NULL in between */ + mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); + mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); + mtree_store_range(mt, 11, 15, xa_mk_value(0xB), GFP_KERNEL); + + /* setup writer 2 that will trigger the race condition */ + mt_set_private(mt); + mt_set_callback(writer2); + + mtree_lock(mt); + /* erase 0-5 */ + mas_erase(&mas); + + /* index 6-10 should retain the value from writer 2 */ + check_load(mt, 6, xa_mk_value(0xC)); + mtree_unlock(mt); + + /* test for the same race but with mas_store_gfp() */ + mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); + mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); + + mas_set_range(&mas, 0, 5); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + + /* ensure write made by writer 2 is retained */ + check_load(mt, 6, xa_mk_value(0xC)); + + mt_set_private(NULL); + mt_set_callback(NULL); + mtree_unlock(mt); +} + void farmer_tests(void) { struct maple_node *node; @@ -36257,6 +36316,10 @@ void farmer_tests(void) check_dfs_preorder(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU); + check_nomem_writer_race(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_prealloc(&tree); mtree_destroy(&tree); diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 4eb442206d01..17263696b5d8 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -26,8 +26,21 @@ struct kmem_cache { unsigned int non_kernel; unsigned long nr_allocated; unsigned long nr_tallocated; + bool exec_callback; + void (*callback)(void *); + void *private; }; +void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)) +{ + cachep->callback = callback; +} + +void kmem_cache_set_private(struct kmem_cache *cachep, void *private) +{ + cachep->private = private; +} + void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) { cachep->non_kernel = val; @@ -58,9 +71,17 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *p; + if (cachep->exec_callback) { + if (cachep->callback) + cachep->callback(cachep->private); + cachep->exec_callback = false; + } + if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) + if (!cachep->non_kernel) { + cachep->exec_callback = true; return NULL; + } cachep->non_kernel--; } @@ -223,6 +244,9 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, ret->objs = NULL; ret->ctor = ctor; ret->non_kernel = 0; + ret->exec_callback = false; + ret->callback = NULL; + ret->private = NULL; return ret; } -- cgit v1.2.3 From 5d659bbb52a24f91cc6c0cf546ffcc0faa7e8f1a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:31 -0400 Subject: maple_tree: introduce mas_wr_store_type() Introduce mas_wr_store_type(), which will set the correct store type based on a walk of the tree. In mas_wr_node_store() the <= min_slots condition is changed to <, since if new_end is equal to mt_min_slots there is not enough room. mas_prealloc_calc() is also introduced to abstract the calculation used to determine the number of nodes needed for a store operation. In this change a call to mas_reset() is removed in the error case of mas_prealloc(). This is only needed in the MA_STATE_REBALANCE case of mas_destroy(). 
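Illustrative sketch, not part of the patch: the helpers introduced below compose as follows, mirroring mas_wr_preallocate() in the diff:

MA_WR_STATE(wr_mas, mas, entry);
mas_wr_prealloc_setup(&wr_mas);
mas_wr_store_type(&wr_mas);              /* classify: exact fit, slot store, split, rebalance, ... */
request = mas_prealloc_calc(mas, entry); /* node count for that store type */
if (request)
	mas_node_count(mas, request);    /* preallocate before the write proceeds */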
We can move the call to mas_reset() directly to mas_destroy(). Also, add a test case to validate that the order in which we check the store type is correct. This test models a vma expanding and then shrinking, which is part of the boot process. Link: https://lkml.kernel.org/r/20240814161944.55347-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 217 ++++++++++++++++++++++++++++----------- tools/testing/radix-tree/maple.c | 36 +++++++ 2 files changed, 193 insertions(+), 60 deletions(-) (limited to 'tools') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index de4a91ced8ca..d0b9b3795b96 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1372,9 +1372,9 @@ retry: return NULL; } + mas->node = NULL; /* empty tree */ if (unlikely(!root)) { - mas->node = NULL; mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; @@ -3890,7 +3890,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, bool in_rcu = mt_in_rcu(mas->tree); /* Check if there is enough data. The room is enough. */ - if (!mte_is_root(mas->node) && (new_end <= mt_min_slots[wr_mas->type]) && + if (!mte_is_root(mas->node) && (new_end < mt_min_slots[wr_mas->type]) && !(mas->mas_flags & MA_STATE_BULK)) return false; @@ -4275,6 +4275,146 @@ static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) wr_mas->content = mas_start(mas); } +/** + * mas_prealloc_calc() - Calculate number of nodes needed for a + * given store operation + * @mas: The maple state + * @entry: The entry to store into the tree + * + * Return: Number of nodes required for preallocation. + */ +static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) +{ + int ret = mas_mt_height(mas) * 3 + 1; + + switch (mas->store_type) { + case wr_invalid: + WARN_ON_ONCE(1); + break; + case wr_new_root: + ret = 1; + break; + case wr_store_root: + if (likely((mas->last != 0) || (mas->index != 0))) + ret = 1; + else if (((unsigned long) (entry) & 3) == 2) + ret = 1; + else + ret = 0; + break; + case wr_spanning_store: + ret = mas_mt_height(mas) * 3 + 1; + break; + case wr_split_store: + ret = mas_mt_height(mas) * 2 + 1; + break; + case wr_rebalance: + ret = mas_mt_height(mas) * 2 - 1; + break; + case wr_node_store: + ret = mt_in_rcu(mas->tree) ? 1 : 0; + break; + case wr_append: + case wr_exact_fit: + case wr_slot_store: + ret = 0; + } + + return ret; +} + +/* + * mas_wr_store_type() - Set the store type for a given + * store operation. + * @wr_mas: The maple write state + */ +static inline void mas_wr_store_type(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + unsigned char new_end; + + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) { + mas->store_type = wr_store_root; + return; + } + + if (unlikely(!mas_wr_walk(wr_mas))) { + mas->store_type = wr_spanning_store; + return; + } + + /* At this point, we are at the leaf node that needs to be altered.
*/ + mas_wr_end_piv(wr_mas); + if (!wr_mas->entry) + mas_wr_extend_null(wr_mas); + + new_end = mas_wr_new_end(wr_mas); + if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) { + mas->store_type = wr_exact_fit; + return; + } + + if (unlikely(!mas->index && mas->last == ULONG_MAX)) { + mas->store_type = wr_new_root; + return; + } + + /* Potential spanning rebalance collapsing a node */ + if (new_end < mt_min_slots[wr_mas->type]) { + if (!mte_is_root(mas->node)) { + mas->store_type = wr_rebalance; + return; + } + mas->store_type = wr_node_store; + return; + } + + if (new_end >= mt_slots[wr_mas->type]) { + mas->store_type = wr_split_store; + return; + } + + if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) { + mas->store_type = wr_append; + return; + } + + if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || + (wr_mas->offset_end - mas->offset == 1))) { + mas->store_type = wr_slot_store; + return; + } + + if (mte_is_root(mas->node) || (new_end >= mt_min_slots[wr_mas->type]) || + (mas->mas_flags & MA_STATE_BULK)) { + mas->store_type = wr_node_store; + return; + } + + mas->store_type = wr_invalid; + MAS_WARN_ON(mas, 1); +} + +/** + * mas_wr_preallocate() - Preallocate enough nodes for a store operation + * @wr_mas: The maple write state + * @entry: The entry that will be stored + * + */ +static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) +{ + struct ma_state *mas = wr_mas->mas; + int request; + + mas_wr_prealloc_setup(wr_mas); + mas_wr_store_type(wr_mas); + request = mas_prealloc_calc(mas, entry); + if (!request) + return; + + mas_node_count(mas, request); +} + /** * mas_insert() - Internal call to insert a value * @mas: The maple state @@ -5508,69 +5648,25 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); - unsigned char node_size; - int request = 1; - int ret; - - - if (unlikely(!mas->index && mas->last == ULONG_MAX)) - goto ask_now; + int ret = 0; + int request; mas_wr_prealloc_setup(&wr_mas); - /* Root expand */ - if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) - goto ask_now; - - if (unlikely(!mas_wr_walk(&wr_mas))) { - /* Spanning store, use worst case for now */ - request = 1 + mas_mt_height(mas) * 3; - goto ask_now; - } - - /* At this point, we are at the leaf node that needs to be altered. */ - /* Exact fit, no nodes needed. */ - if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last) - return 0; - - mas_wr_end_piv(&wr_mas); - node_size = mas_wr_new_end(&wr_mas); - - /* Slot store, does not require additional nodes */ - if (node_size == mas->end) { - /* reuse node */ - if (!mt_in_rcu(mas->tree)) - return 0; - /* shifting boundary */ - if (wr_mas.offset_end - mas->offset == 1) - return 0; - } + mas_wr_store_type(&wr_mas); + request = mas_prealloc_calc(mas, entry); + if (!request) + return ret; - if (node_size >= mt_slots[wr_mas.type]) { - /* Split, worst case for now. 
*/ - request = 1 + mas_mt_height(mas) * 2; - goto ask_now + mas_node_count_gfp(mas, request, gfp); + if (mas_is_err(mas)) { + mas_set_alloc_req(mas, 0); + ret = xa_err(mas->node); + mas_destroy(mas); + mas_reset(mas); + return ret; } - /* New root needs a single node */ - if (unlikely(mte_is_root(mas->node))) - goto ask_now; - - /* Potential spanning rebalance collapsing a node, use worst-case */ - if (node_size - 1 <= mt_min_slots[wr_mas.type]) - request = mas_mt_height(mas) * 2 - 1; - - /* node store, slot store needs one node */ -ask_now: - mas_node_count_gfp(mas, request, gfp); mas->mas_flags |= MA_STATE_PREALLOC; - if (likely(!mas_is_err(mas))) - return 0; - - mas_set_alloc_req(mas, 0); - ret = xa_err(mas->node); - mas_reset(mas); - mas_destroy(mas); - mas_reset(mas); return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); @@ -5596,7 +5692,8 @@ void mas_destroy(struct ma_state *mas) */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; - + if (mas_is_err(mas)) + mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index ef5b83cf94ea..ad42a36231fb 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36283,6 +36283,38 @@ static void check_nomem_writer_race(struct maple_tree *mt) mtree_unlock(mt); } + /* test to simulate expanding a vma from [0x7fffffffe000, 0x7ffffffff000) + * to [0x7ffde4ca1000, 0x7ffffffff000) and then shrinking the vma to + * [0x7ffde4ca1000, 0x7ffde4ca2000) + */ +static inline int check_vma_modification(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 0); + + mtree_lock(mt); + /* vma with old start and old end */ + __mas_set_range(&mas, 0x7fffffffe000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, xa_mk_value(1), GFP_KERNEL); + mas_store_prealloc(&mas, xa_mk_value(1)); + + /* next write occurs partly in previous range [0, 0x7fffffffe000) */ + mas_prev_range(&mas, 0); + /* expand vma to [0x7ffde4ca1000, 0x7ffffffff000) */ + __mas_set_range(&mas, 0x7ffde4ca1000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, xa_mk_value(1), GFP_KERNEL); + mas_store_prealloc(&mas, xa_mk_value(1)); + + /* shrink vma to [0x7ffde4ca1000, 0x7ffde4ca2000) */ + __mas_set_range(&mas, 0x7ffde4ca2000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, NULL, GFP_KERNEL); + mas_store_prealloc(&mas, NULL); + mt_dump(mt, mt_dump_hex); + + mas_destroy(&mas); + mtree_unlock(mt); + return 0; +} + void farmer_tests(void) { struct maple_node *node; @@ -36290,6 +36322,10 @@ void farmer_tests(void) mt_dump(&tree, mt_dump_dec); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU); + check_vma_modification(&tree); + mtree_destroy(&tree); + tree.ma_root = xa_mk_value(0); mt_dump(&tree, mt_dump_dec); -- cgit v1.2.3 From 3cd9e92e009de9a036cbf9ec27bad33367fca6f3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:32 -0400 Subject: maple_tree: remove mas_destroy() from mas_nomem() Separate the call to mas_destroy() from mas_nomem() so we can check for no memory errors without destroying the current maple state in mas_store_gfp(). We then add calls to mas_destroy() to callers of mas_nomem(). Link: https://lkml.kernel.org/r/20240814161944.55347-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R.
Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 37 ++++++++++++++++++++++++------------- tools/testing/radix-tree/maple.c | 10 ++++++---- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0b9b3795b96..58985107cf00 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4519,6 +4519,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; + mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); @@ -5601,21 +5602,25 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); + int ret = 0; - mas_wr_store_setup(&wr_mas); - trace_ma_write(__func__, mas, 0, entry); retry: - mas_wr_store_entry(&wr_mas); + mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } - if (unlikely(mas_is_err(mas))) - return xa_err(mas->node); + if (mas_is_err(mas)) { + ret = xa_err(mas->node); + goto out; + } - return 0; + mas_wr_store_entry(&wr_mas); +out: + mas_destroy(mas); + return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); @@ -6374,6 +6379,7 @@ write_retry: goto write_retry; } + mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); @@ -6388,10 +6394,8 @@ EXPORT_SYMBOL_GPL(mas_erase); bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { - if (likely(mas->node != MA_ERROR(-ENOMEM))) { - mas_destroy(mas); + if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; - } if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); @@ -6469,6 +6473,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, { MA_STATE(mas, mt, index, last); MA_WR_STATE(wr_mas, &mas, entry); + int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) @@ -6484,10 +6489,12 @@ retry: goto retry; mtree_unlock(mt); + if (mas_is_err(&mas)) - return xa_err(mas.node); + ret = xa_err(mas.node); - return 0; + mas_destroy(&mas); + return ret; } EXPORT_SYMBOL(mtree_store_range); @@ -6523,6 +6530,7 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); + int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; @@ -6538,9 +6546,10 @@ retry: mtree_unlock(mt); if (mas_is_err(&ms)) - return xa_err(ms.node); + ret = xa_err(ms.node); - return 0; + mas_destroy(&ms); + return ret; } EXPORT_SYMBOL(mtree_insert_range); @@ -6595,6 +6604,7 @@ retry: unlock: mtree_unlock(mt); + mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); @@ -6676,6 +6686,7 @@ retry: unlock: mtree_unlock(mt); + mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index ad42a36231fb..c5b00aca9def 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -120,7 +120,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); mas_push_node(&mas, mn); mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); /* free */ + mas_destroy(&mas); mtree_unlock(mt); @@ -144,7 +144,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); 
ma_free_rcu(mn); mas.status = ma_start; - mas_nomem(&mas, GFP_KERNEL); + mas_destroy(&mas); /* Allocate 3 nodes, will fail. */ mas_node_count(&mas, 3); /* Drop the lock and allocate 3 nodes. */ @@ -161,7 +161,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != 3); /* Free. */ mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); + mas_destroy(&mas); /* Set allocation request to 1. */ mas_set_alloc_req(&mas, 1); @@ -277,6 +277,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) } mas_reset(&mas); MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); + mas_destroy(&mas); } @@ -299,7 +300,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) } MT_BUG_ON(mt, mas_allocated(&mas) != total); mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); /* Free. */ + mas_destroy(&mas); /* Free. */ MT_BUG_ON(mt, mas_allocated(&mas) != 0); for (i = 1; i < 128; i++) { @@ -35847,6 +35848,7 @@ static noinline void __init check_nomem(struct maple_tree *mt) mas_store(&ms, &ms); /* insert 1 -> &ms */ mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. */ mtree_unlock(mt); + mas_destroy(&ms); mtree_destroy(mt); } -- cgit v1.2.3 From 11ee88a0f98770719f29b7d1efb2a2ca6a83af3c Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 29 Jul 2024 01:45:08 -0700 Subject: fault-injection: enhance failcmd to exit on non-hex address input The failcmd.sh script in the fault-injection toolkit does not currently validate whether the provided address is in hexadecimal format. This can lead to silent failures if the address is sourced from places like `/proc/kallsyms`, which omits the '0x' prefix, potentially causing users to operate under incorrect assumptions. Introduce a new function, `exit_if_not_hex`, which checks the format of the provided address and exits with an error message if the address is not a valid hexadecimal number. This enhancement prevents users from running the command with improperly formatted addresses, thus improving the robustness and usability of the failcmd tool. Link: https://lkml.kernel.org/r/20240729084512.3349928-1-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Signed-off-by: Andrew Morton --- tools/testing/fault-injection/failcmd.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/fault-injection/failcmd.sh b/tools/testing/fault-injection/failcmd.sh index 78dac34264be..c4f2432750f4 100644 --- a/tools/testing/fault-injection/failcmd.sh +++ b/tools/testing/fault-injection/failcmd.sh @@ -64,6 +64,14 @@ ENVIRONMENT EOF } +exit_if_not_hex() { + local value="$1" + if ! [[ $value =~ ^0x[0-9a-fA-F]+$ ]]; then + echo "Error: The provided value '$value' is not a valid hexadecimal number." >&2 + exit 1 + fi +} + if [ $UID != 0 ]; then echo must be run as root >&2 exit 1 @@ -160,18 +168,22 @@ while true; do shift 2 ;; --require-start) + exit_if_not_hex "$2" echo $2 > $FAULTATTR/require-start shift 2 ;; --require-end) + exit_if_not_hex "$2" echo $2 > $FAULTATTR/require-end shift 2 ;; --reject-start) + exit_if_not_hex "$2" echo $2 > $FAULTATTR/reject-start shift 2 ;; --reject-end) + exit_if_not_hex "$2" echo $2 > $FAULTATTR/reject-end shift 2 ;; -- cgit v1.2.3 From 8af2caf7307d8002d88c377c58eba8cae75a2109 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 29 Jul 2024 01:52:11 -0700 Subject: failcmd: make failcmd.sh executable Change the file permissions of tools/testing/fault-injection/failcmd.sh to allow execution. 
This ensures the script can be run directly without explicitly invoking a shell. Link: https://lkml.kernel.org/r/20240729085215.3403417-1-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Signed-off-by: Andrew Morton --- tools/testing/fault-injection/failcmd.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/fault-injection/failcmd.sh (limited to 'tools') diff --git a/tools/testing/fault-injection/failcmd.sh b/tools/testing/fault-injection/failcmd.sh old mode 100644 new mode 100755 -- cgit v1.2.3 From 931a36c4138ac418d487bd4db0d03780b46a77ba Mon Sep 17 00:00:00 2001 From: zhangjiao Date: Thu, 29 Aug 2024 14:29:42 +0800 Subject: tools: gpio: rm .*.cmd on make clean Remove the generated .*.cmd files when calling make clean. Signed-off-by: zhangjiao Link: https://lore.kernel.org/r/20240829062942.11487-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Bartosz Golaszewski --- tools/gpio/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile index d29c9c49e251..ed565eb52275 100644 --- a/tools/gpio/Makefile +++ b/tools/gpio/Makefile @@ -78,7 +78,7 @@ $(OUTPUT)gpio-watch: $(GPIO_WATCH_IN) clean: rm -f $(ALL_PROGRAMS) rm -f $(OUTPUT)include/linux/gpio.h - find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(bindir); \ -- cgit v1.2.3 From 0e7eb23668948585f3f0ea8c6249338f33fde872 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 30 Aug 2024 19:53:47 -0300 Subject: perf tools: Build x86 32-bit syscall table from arch/x86/entry/syscalls/syscall_32.tbl Remove one more use of the audit libs, and address a reported problem where a function isn't available when using the audit-libs method. That method should really go away; this is one step in that direction. The script used to generate the 64-bit syscall table was already parametrized to generate for both 64-bit and 32-bit, so just use it and wire the generated table to the syscalltbl.c routines.
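The generated syscalls_32.c follows the same shape as the existing 64-bit table: a string array indexed by syscall number plus a max-id macro, which util/syscalltbl.c then points its native table at. Below is a minimal C sketch of that shape and a lookup over it; the array and macro names match the ones referenced in the syscalltbl.c hunk, but only a handful of entries from syscall_32.tbl are shown, and the lookup helper is illustrative rather than perf's actual API.

    #include <stdio.h>

    /* Approximate shape of the table emitted by syscalltbl.sh for i386;
     * holes (unimplemented syscall numbers) stay NULL. */
    static const char *const syscalltbl_x86[] = {
            [0]   = "restart_syscall",
            [3]   = "read",
            [4]   = "write",
            [5]   = "open",
            [462] = "mseal",
    };
    #define SYSCALLTBL_x86_MAX_ID 462

    /* Simplified id-to-name lookup, bounds-checked against the max id. */
    static const char *syscall_name(int id)
    {
            if (id < 0 || id > SYSCALLTBL_x86_MAX_ID || !syscalltbl_x86[id])
                    return "unknown";
            return syscalltbl_x86[id];
    }

    int main(void)
    {
            printf("id 4 -> %s\n", syscall_name(4)); /* prints "write" */
            return 0;
    }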
Reported-by: Jiri Slaby Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Tested-by: Jiri Slaby Cc: Adrian Hunter Cc: Howard Chu Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/6fe63fa3-6c63-4b75-ac09-884d26f6fb95@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 13 +- tools/perf/arch/x86/Makefile | 6 +- tools/perf/arch/x86/entry/syscalls/syscall_32.tbl | 470 ++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + tools/perf/util/syscalltbl.c | 4 + 5 files changed, 484 insertions(+), 10 deletions(-) create mode 100644 tools/perf/arch/x86/entry/syscalls/syscall_32.tbl (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 4eb1fc897baf..9998cd7a8792 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -31,14 +31,8 @@ $(call detected_var,SRCARCH) ifneq ($(NO_SYSCALL_TABLE),1) NO_SYSCALL_TABLE := 1 - ifeq ($(SRCARCH),x86) - ifeq (${IS_64_BIT}, 1) - NO_SYSCALL_TABLE := 0 - endif - else - ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390 mips loongarch)) - NO_SYSCALL_TABLE := 0 - endif + ifeq ($(SRCARCH),$(filter $(SRCARCH),x86 powerpc arm64 s390 mips loongarch)) + NO_SYSCALL_TABLE := 0 endif ifneq ($(NO_SYSCALL_TABLE),1) @@ -55,8 +49,9 @@ endif # Additional ARCH settings for x86 ifeq ($(SRCARCH),x86) $(call detected,CONFIG_X86) + CFLAGS += -I$(OUTPUT)arch/x86/include/generated ifeq (${IS_64_BIT}, 1) - CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -I$(OUTPUT)arch/x86/include/generated + CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S LIBUNWIND_LIBS = -lunwind-x86_64 -lunwind -llzma $(call detected,CONFIG_X86_64) diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index 8952e00f9b60..67b4969a6738 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -13,6 +13,7 @@ PERF_HAVE_JITDUMP := 1 generated := $(OUTPUT)arch/x86/include/generated out := $(generated)/asm header := $(out)/syscalls_64.c +header_32 := $(out)/syscalls_32.c sys := $(srctree)/tools/perf/arch/x86/entry/syscalls systbl := $(sys)/syscalltbl.sh @@ -22,7 +23,10 @@ $(shell [ -d '$(out)' ] || mkdir -p '$(out)') $(header): $(sys)/syscall_64.tbl $(systbl) $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@ +$(header_32): $(sys)/syscall_32.tbl $(systbl) + $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_32.tbl 'x86' > $@ + clean:: $(call QUIET_CLEAN, x86) $(RM) -r $(header) $(generated) -archheaders: $(header) +archheaders: $(header) $(header_32) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl new file mode 100644 index 000000000000..534c74b14fab --- /dev/null +++ b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl @@ -0,0 +1,470 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# 32-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]] +# +# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for +# sys_*() system calls and compat_sys_*() compat system calls if +# IA32_EMULATION is defined, and expect struct pt_regs *regs as their only +# parameter. +# +# The abi is always "i386" for this file.
+# +0 i386 restart_syscall sys_restart_syscall +1 i386 exit sys_exit - noreturn +2 i386 fork sys_fork +3 i386 read sys_read +4 i386 write sys_write +5 i386 open sys_open compat_sys_open +6 i386 close sys_close +7 i386 waitpid sys_waitpid +8 i386 creat sys_creat +9 i386 link sys_link +10 i386 unlink sys_unlink +11 i386 execve sys_execve compat_sys_execve +12 i386 chdir sys_chdir +13 i386 time sys_time32 +14 i386 mknod sys_mknod +15 i386 chmod sys_chmod +16 i386 lchown sys_lchown16 +17 i386 break +18 i386 oldstat sys_stat +19 i386 lseek sys_lseek compat_sys_lseek +20 i386 getpid sys_getpid +21 i386 mount sys_mount +22 i386 umount sys_oldumount +23 i386 setuid sys_setuid16 +24 i386 getuid sys_getuid16 +25 i386 stime sys_stime32 +26 i386 ptrace sys_ptrace compat_sys_ptrace +27 i386 alarm sys_alarm +28 i386 oldfstat sys_fstat +29 i386 pause sys_pause +30 i386 utime sys_utime32 +31 i386 stty +32 i386 gtty +33 i386 access sys_access +34 i386 nice sys_nice +35 i386 ftime +36 i386 sync sys_sync +37 i386 kill sys_kill +38 i386 rename sys_rename +39 i386 mkdir sys_mkdir +40 i386 rmdir sys_rmdir +41 i386 dup sys_dup +42 i386 pipe sys_pipe +43 i386 times sys_times compat_sys_times +44 i386 prof +45 i386 brk sys_brk +46 i386 setgid sys_setgid16 +47 i386 getgid sys_getgid16 +48 i386 signal sys_signal +49 i386 geteuid sys_geteuid16 +50 i386 getegid sys_getegid16 +51 i386 acct sys_acct +52 i386 umount2 sys_umount +53 i386 lock +54 i386 ioctl sys_ioctl compat_sys_ioctl +55 i386 fcntl sys_fcntl compat_sys_fcntl64 +56 i386 mpx +57 i386 setpgid sys_setpgid +58 i386 ulimit +59 i386 oldolduname sys_olduname +60 i386 umask sys_umask +61 i386 chroot sys_chroot +62 i386 ustat sys_ustat compat_sys_ustat +63 i386 dup2 sys_dup2 +64 i386 getppid sys_getppid +65 i386 getpgrp sys_getpgrp +66 i386 setsid sys_setsid +67 i386 sigaction sys_sigaction compat_sys_sigaction +68 i386 sgetmask sys_sgetmask +69 i386 ssetmask sys_ssetmask +70 i386 setreuid sys_setreuid16 +71 i386 setregid sys_setregid16 +72 i386 sigsuspend sys_sigsuspend +73 i386 sigpending sys_sigpending compat_sys_sigpending +74 i386 sethostname sys_sethostname +75 i386 setrlimit sys_setrlimit compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 +81 i386 setgroups sys_setgroups16 +82 i386 select sys_old_select compat_sys_old_select +83 i386 symlink sys_symlink +84 i386 oldlstat sys_lstat +85 i386 readlink sys_readlink +86 i386 uselib sys_uselib +87 i386 swapon sys_swapon +88 i386 reboot sys_reboot +89 i386 readdir sys_old_readdir compat_sys_old_readdir +90 i386 mmap sys_old_mmap compat_sys_ia32_mmap +91 i386 munmap sys_munmap +92 i386 truncate sys_truncate compat_sys_truncate +93 i386 ftruncate sys_ftruncate compat_sys_ftruncate +94 i386 fchmod sys_fchmod +95 i386 fchown sys_fchown16 +96 i386 getpriority sys_getpriority +97 i386 setpriority sys_setpriority +98 i386 profil +99 i386 statfs sys_statfs compat_sys_statfs +100 i386 fstatfs sys_fstatfs compat_sys_fstatfs +101 i386 ioperm sys_ioperm +102 i386 socketcall sys_socketcall compat_sys_socketcall +103 i386 syslog sys_syslog +104 i386 setitimer sys_setitimer compat_sys_setitimer +105 i386 getitimer sys_getitimer compat_sys_getitimer +106 i386 stat sys_newstat compat_sys_newstat +107 i386 lstat sys_newlstat compat_sys_newlstat +108 i386 fstat sys_newfstat 
compat_sys_newfstat +109 i386 olduname sys_uname +110 i386 iopl sys_iopl +111 i386 vhangup sys_vhangup +112 i386 idle +113 i386 vm86old sys_vm86old sys_ni_syscall +114 i386 wait4 sys_wait4 compat_sys_wait4 +115 i386 swapoff sys_swapoff +116 i386 sysinfo sys_sysinfo compat_sys_sysinfo +117 i386 ipc sys_ipc compat_sys_ipc +118 i386 fsync sys_fsync +119 i386 sigreturn sys_sigreturn compat_sys_sigreturn +120 i386 clone sys_clone compat_sys_ia32_clone +121 i386 setdomainname sys_setdomainname +122 i386 uname sys_newuname +123 i386 modify_ldt sys_modify_ldt +124 i386 adjtimex sys_adjtimex_time32 +125 i386 mprotect sys_mprotect +126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 i386 create_module +128 i386 init_module sys_init_module +129 i386 delete_module sys_delete_module +130 i386 get_kernel_syms +131 i386 quotactl sys_quotactl +132 i386 getpgid sys_getpgid +133 i386 fchdir sys_fchdir +134 i386 bdflush sys_ni_syscall +135 i386 sysfs sys_sysfs +136 i386 personality sys_personality +137 i386 afs_syscall +138 i386 setfsuid sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 +140 i386 _llseek sys_llseek +141 i386 getdents sys_getdents compat_sys_getdents +142 i386 _newselect sys_select compat_sys_select +143 i386 flock sys_flock +144 i386 msync sys_msync +145 i386 readv sys_readv +146 i386 writev sys_writev +147 i386 getsid sys_getsid +148 i386 fdatasync sys_fdatasync +149 i386 _sysctl sys_ni_syscall +150 i386 mlock sys_mlock +151 i386 munlock sys_munlock +152 i386 mlockall sys_mlockall +153 i386 munlockall sys_munlockall +154 i386 sched_setparam sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 i386 nanosleep sys_nanosleep_time32 +163 i386 mremap sys_mremap +164 i386 setresuid sys_setresuid16 +165 i386 getresuid sys_getresuid16 +166 i386 vm86 sys_vm86 sys_ni_syscall +167 i386 query_module +168 i386 poll sys_poll +169 i386 nfsservctl +170 i386 setresgid sys_setresgid16 +171 i386 getresgid sys_getresgid16 +172 i386 prctl sys_prctl +173 i386 rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32 +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend +180 i386 pread64 sys_ia32_pread64 +181 i386 pwrite64 sys_ia32_pwrite64 +182 i386 chown sys_chown16 +183 i386 getcwd sys_getcwd +184 i386 capget sys_capget +185 i386 capset sys_capset +186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack +187 i386 sendfile sys_sendfile compat_sys_sendfile +188 i386 getpmsg +189 i386 putpmsg +190 i386 vfork sys_vfork +191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff +193 i386 truncate64 sys_ia32_truncate64 +194 i386 ftruncate64 sys_ia32_ftruncate64 +195 i386 stat64 sys_stat64 compat_sys_ia32_stat64 +196 i386 lstat64 sys_lstat64 compat_sys_ia32_lstat64 +197 i386 fstat64 sys_fstat64 compat_sys_ia32_fstat64 +198 i386 lchown32 sys_lchown +199 i386 getuid32 sys_getuid 
+200 i386 getgid32 sys_getgid +201 i386 geteuid32 sys_geteuid +202 i386 getegid32 sys_getegid +203 i386 setreuid32 sys_setreuid +204 i386 setregid32 sys_setregid +205 i386 getgroups32 sys_getgroups +206 i386 setgroups32 sys_setgroups +207 i386 fchown32 sys_fchown +208 i386 setresuid32 sys_setresuid +209 i386 getresuid32 sys_getresuid +210 i386 setresgid32 sys_setresgid +211 i386 getresgid32 sys_getresgid +212 i386 chown32 sys_chown +213 i386 setuid32 sys_setuid +214 i386 setgid32 sys_setgid +215 i386 setfsuid32 sys_setfsuid +216 i386 setfsgid32 sys_setfsgid +217 i386 pivot_root sys_pivot_root +218 i386 mincore sys_mincore +219 i386 madvise sys_madvise +220 i386 getdents64 sys_getdents64 +221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +# 222 is unused +# 223 is unused +224 i386 gettid sys_gettid +225 i386 readahead sys_ia32_readahead +226 i386 setxattr sys_setxattr +227 i386 lsetxattr sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr +229 i386 getxattr sys_getxattr +230 i386 lgetxattr sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr +232 i386 listxattr sys_listxattr +233 i386 llistxattr sys_llistxattr +234 i386 flistxattr sys_flistxattr +235 i386 removexattr sys_removexattr +236 i386 lremovexattr sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr +238 i386 tkill sys_tkill +239 i386 sendfile64 sys_sendfile64 +240 i386 futex sys_futex_time32 +241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area +245 i386 io_setup sys_io_setup compat_sys_io_setup +246 i386 io_destroy sys_io_destroy +247 i386 io_getevents sys_io_getevents_time32 +248 i386 io_submit sys_io_submit compat_sys_io_submit +249 i386 io_cancel sys_io_cancel +250 i386 fadvise64 sys_ia32_fadvise64 +# 251 is available for reuse (was briefly sys_set_zone_reclaim) +252 i386 exit_group sys_exit_group - noreturn +253 i386 lookup_dcookie +254 i386 epoll_create sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address +259 i386 timer_create sys_timer_create compat_sys_timer_create +260 i386 timer_settime sys_timer_settime32 +261 i386 timer_gettime sys_timer_gettime32 +262 i386 timer_getoverrun sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete +264 i386 clock_settime sys_clock_settime32 +265 i386 clock_gettime sys_clock_gettime32 +266 i386 clock_getres sys_clock_getres_time32 +267 i386 clock_nanosleep sys_clock_nanosleep_time32 +268 i386 statfs64 sys_statfs64 compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill +271 i386 utimes sys_utimes_time32 +272 i386 fadvise64_64 sys_ia32_fadvise64_64 +273 i386 vserver +274 i386 mbind sys_mbind +275 i386 get_mempolicy sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy +277 i386 mq_open sys_mq_open compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend_time32 +280 i386 mq_timedreceive sys_mq_timedreceive_time32 +281 i386 mq_notify sys_mq_notify compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load compat_sys_kexec_load +284 i386 waitid sys_waitid compat_sys_waitid +# 285 sys_setaltroot +286 i386 add_key sys_add_key +287 i386 request_key sys_request_key +288 i386 keyctl sys_keyctl compat_sys_keyctl +289 i386 
ioprio_set sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get +291 i386 inotify_init sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages +295 i386 openat sys_openat compat_sys_openat +296 i386 mkdirat sys_mkdirat +297 i386 mknodat sys_mknodat +298 i386 fchownat sys_fchownat +299 i386 futimesat sys_futimesat_time32 +300 i386 fstatat64 sys_fstatat64 compat_sys_ia32_fstatat64 +301 i386 unlinkat sys_unlinkat +302 i386 renameat sys_renameat +303 i386 linkat sys_linkat +304 i386 symlinkat sys_symlinkat +305 i386 readlinkat sys_readlinkat +306 i386 fchmodat sys_fchmodat +307 i386 faccessat sys_faccessat +308 i386 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32 +309 i386 ppoll sys_ppoll_time32 compat_sys_ppoll_time32 +310 i386 unshare sys_unshare +311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list +313 i386 splice sys_splice +314 i386 sync_file_range sys_ia32_sync_file_range +315 i386 tee sys_tee +316 i386 vmsplice sys_vmsplice +317 i386 move_pages sys_move_pages +318 i386 getcpu sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait +320 i386 utimensat sys_utimensat_time32 +321 i386 signalfd sys_signalfd compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create +323 i386 eventfd sys_eventfd +324 i386 fallocate sys_ia32_fallocate +325 i386 timerfd_settime sys_timerfd_settime32 +326 i386 timerfd_gettime sys_timerfd_gettime32 +327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 +330 i386 dup3 sys_dup3 +331 i386 pipe2 sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 +333 i386 preadv sys_preadv compat_sys_preadv +334 i386 pwritev sys_pwritev compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 +338 i386 fanotify_init sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +340 i386 prlimit64 sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime32 +344 i386 syncfs sys_syncfs +345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg +346 i386 setns sys_setns +347 i386 process_vm_readv sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev +349 i386 kcmp sys_kcmp +350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 +354 i386 seccomp sys_seccomp +355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf +358 i386 execveat sys_execveat compat_sys_execveat +359 i386 socket sys_socket +360 i386 socketpair sys_socketpair +361 i386 bind sys_bind +362 i386 connect sys_connect +363 i386 listen sys_listen +364 i386 accept4 sys_accept4 +365 i386 getsockopt sys_getsockopt sys_getsockopt +366 i386 setsockopt sys_setsockopt sys_setsockopt +367 i386 getsockname sys_getsockname +368 i386 getpeername sys_getpeername +369 i386 sendto sys_sendto +370 i386 sendmsg sys_sendmsg compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg compat_sys_recvmsg +373 i386 shutdown sys_shutdown +374 i386 userfaultfd sys_userfaultfd +375 i386 membarrier 
sys_membarrier +376 i386 mlock2 sys_mlock2 +377 i386 copy_file_range sys_copy_file_range +378 i386 preadv2 sys_preadv2 compat_sys_preadv2 +379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 +380 i386 pkey_mprotect sys_pkey_mprotect +381 i386 pkey_alloc sys_pkey_alloc +382 i386 pkey_free sys_pkey_free +383 i386 statx sys_statx +384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents +386 i386 rseq sys_rseq +393 i386 semget sys_semget +394 i386 semctl sys_semctl compat_sys_semctl +395 i386 shmget sys_shmget +396 i386 shmctl sys_shmctl compat_sys_shmctl +397 i386 shmat sys_shmat compat_sys_shmat +398 i386 shmdt sys_shmdt +399 i386 msgget sys_msgget +400 i386 msgsnd sys_msgsnd compat_sys_msgsnd +401 i386 msgrcv sys_msgrcv compat_sys_msgrcv +402 i386 msgctl sys_msgctl compat_sys_msgctl +403 i386 clock_gettime64 sys_clock_gettime +404 i386 clock_settime64 sys_clock_settime +405 i386 clock_adjtime64 sys_clock_adjtime +406 i386 clock_getres_time64 sys_clock_getres +407 i386 clock_nanosleep_time64 sys_clock_nanosleep +408 i386 timer_gettime64 sys_timer_gettime +409 i386 timer_settime64 sys_timer_settime +410 i386 timerfd_gettime64 sys_timerfd_gettime +411 i386 timerfd_settime64 sys_timerfd_settime +412 i386 utimensat_time64 sys_utimensat +413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 +414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 +416 i386 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 +417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 +418 i386 mq_timedsend_time64 sys_mq_timedsend +419 i386 mq_timedreceive_time64 sys_mq_timedreceive +420 i386 semtimedop_time64 sys_semtimedop +421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 +422 i386 futex_time64 sys_futex +423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 i386 pidfd_send_signal sys_pidfd_send_signal +425 i386 io_uring_setup sys_io_uring_setup +426 i386 io_uring_enter sys_io_uring_enter +427 i386 io_uring_register sys_io_uring_register +428 i386 open_tree sys_open_tree +429 i386 move_mount sys_move_mount +430 i386 fsopen sys_fsopen +431 i386 fsconfig sys_fsconfig +432 i386 fsmount sys_fsmount +433 i386 fspick sys_fspick +434 i386 pidfd_open sys_pidfd_open +435 i386 clone3 sys_clone3 +436 i386 close_range sys_close_range +437 i386 openat2 sys_openat2 +438 i386 pidfd_getfd sys_pidfd_getfd +439 i386 faccessat2 sys_faccessat2 +440 i386 process_madvise sys_process_madvise +441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 i386 mount_setattr sys_mount_setattr +443 i386 quotactl_fd sys_quotactl_fd +444 i386 landlock_create_ruleset sys_landlock_create_ruleset +445 i386 landlock_add_rule sys_landlock_add_rule +446 i386 landlock_restrict_self sys_landlock_restrict_self +447 i386 memfd_secret sys_memfd_secret +448 i386 process_mrelease sys_process_mrelease +449 i386 futex_waitv sys_futex_waitv +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 cachestat sys_cachestat +452 i386 fchmodat2 sys_fchmodat2 +453 i386 map_shadow_stack sys_map_shadow_stack +454 i386 futex_wake sys_futex_wake +455 i386 futex_wait sys_futex_wait +456 i386 futex_requeue sys_futex_requeue +457 i386 statmount sys_statmount +458 i386 listmount sys_listmount +459 i386 lsm_get_self_attr sys_lsm_get_self_attr +460 i386 lsm_set_self_attr sys_lsm_set_self_attr +461 i386 lsm_list_modules sys_lsm_list_modules +462 i386 mseal sys_mseal diff --git 
a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 672421b858ac..714c78e5da07 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -172,6 +172,7 @@ check lib/ctype.c '-I "^EXPORT_SYMBOL" -I "^#include <linux/ctype.h>" -B' check lib/list_sort.c '-I "^#include <linux/bug.h>"' # diff non-symmetric files +check_2 tools/perf/arch/x86/entry/syscalls/syscall_32.tbl arch/x86/entry/syscalls/syscall_32.tbl check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl check_2 tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index 0dd26b991b3f..7c15dec6900d 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -18,6 +18,10 @@ #include <asm/syscalls_64.c> const int syscalltbl_native_max_id = SYSCALLTBL_x86_64_MAX_ID; static const char *const *syscalltbl_native = syscalltbl_x86_64; +#elif defined(__i386__) +#include <asm/syscalls_32.c> +const int syscalltbl_native_max_id = SYSCALLTBL_x86_MAX_ID; +static const char *const *syscalltbl_native = syscalltbl_x86; #elif defined(__s390x__) #include <asm/syscalls_64.c> const int syscalltbl_native_max_id = SYSCALLTBL_S390_64_MAX_ID; -- cgit v1.2.3 From 5d2784e25d7a6c216c9a4cbed2026febfb0c094c Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sun, 1 Sep 2024 22:07:42 +0100 Subject: bpftool: Add missing blank lines in bpftool-net doc example In bpftool-net documentation, two blank lines are missing in a recently added example, causing docutils to complain: $ cd tools/bpf/bpftool $ make doc DESCEND Documentation GEN bpftool-btf.8 GEN bpftool-cgroup.8 GEN bpftool-feature.8 GEN bpftool-gen.8 GEN bpftool-iter.8 GEN bpftool-link.8 GEN bpftool-map.8 GEN bpftool-net.8 <stdin>:189: (INFO/1) Possible incomplete section title. Treating the overline as ordinary text because it's so short. <stdin>:192: (INFO/1) Blank line missing before literal block (after the "::")? Interpreted as a definition list item. <stdin>:199: (INFO/1) Possible incomplete section title. Treating the overline as ordinary text because it's so short. <stdin>:201: (INFO/1) Blank line missing before literal block (after the "::")? Interpreted as a definition list item. GEN bpftool-perf.8 GEN bpftool-prog.8 GEN bpftool.8 GEN bpftool-struct_ops.8 Add the missing blank lines. Fixes: 0d7c06125cea ("bpftool: Add document for net attach/detach on tcx subcommand") Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240901210742.25758-1-qmo@kernel.org --- tools/bpf/bpftool/Documentation/bpftool-net.rst | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 4a8cb5e0d94b..a9ed8992800f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -187,6 +187,7 @@ EXAMPLES | :: + tc: lo(1) tcx/ingress tc_prog prog_id 29 @@ -197,4 +198,5 @@ EXAMPLES | :: + tc: -- cgit v1.2.3 From b0222d1d9e6f8551a056b89b0bff38f515f3c9b5 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 2 Sep 2024 18:17:21 +0100 Subject: bpftool: Fix handling enum64 in btf dump sorting The wrong function was used to access the first element of an enum64. Substitute btf_enum(t) with btf_enum64(t) for BTF_KIND_ENUM64.
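The underlying issue: btf_enum() and btf_enum64() both return a pointer to the member array that immediately follows struct btf_type, but the element sizes differ, so walking enum64 members through struct btf_enum goes out of step with the real element boundaries. A self-contained sketch of the layout difference, with the two member struct definitions copied from include/uapi/linux/btf.h:

    #include <stdio.h>
    #include <stdint.h>

    struct btf_enum {        /* BTF_KIND_ENUM member: 8 bytes */
            uint32_t name_off;
            int32_t  val;
    };

    struct btf_enum64 {      /* BTF_KIND_ENUM64 member: 12 bytes */
            uint32_t name_off;
            uint32_t val_lo32;
            uint32_t val_hi32;
    };

    int main(void)
    {
            /* Element i of an enum64 array starts 12 * i bytes in, so
             * indexing it as struct btf_enum (stride 8) reads name_off
             * from the wrong offset for every i > 0, and is wrong-typed
             * even for the first element that the sort code looks at. */
            printf("enum stride %zu, enum64 stride %zu\n",
                   sizeof(struct btf_enum), sizeof(struct btf_enum64));
            return 0;
    }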
Fixes: 94133cf24bb3 ("bpftool: Introduce btf c dump sorting") Signed-off-by: Mykyta Yatsenko Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240902171721.105253-1-mykyta.yatsenko5@gmail.com --- tools/bpf/bpftool/btf.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 6789c7a4d5ca..3b57ba095ab6 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -561,9 +561,10 @@ static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool f case BTF_KIND_ENUM64: { int name_off = t->name_off; - /* Use name of the first element for anonymous enums if allowed */ - if (!from_ref && !t->name_off && btf_vlen(t)) - name_off = btf_enum(t)->name_off; + if (!from_ref && !name_off && btf_vlen(t)) + name_off = btf_kind(t) == BTF_KIND_ENUM64 ? + btf_enum64(t)->name_off : + btf_enum(t)->name_off; return btf__name_by_offset(btf, name_off); } -- cgit v1.2.3 From 673f0c3ffc75166317ca5edb66ff35eaa6a12149 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Sun, 1 Sep 2024 21:11:18 +0200 Subject: tools: usb: p9_fwd: add usb gadget packet forwarder script Add a small Python tool that forwards 9pfs requests from the USB gadget to an existing 9pfs TCP server. Since all 9pfs servers currently lack support for the usb transport, this tool is a useful helper to get started. Refer to the "USBG Example" section in Documentation/filesystems/9p.rst for how to use it. Signed-off-by: Jan Luebbe Signed-off-by: Michael Grzeschik Tested-by: Andrzej Pietrasiewicz Link: https://lore.kernel.org/r/20240116-ml-topic-u9p-v12-3-9a27de5160e0@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/9p.rst | 41 +++++++ tools/usb/p9_fwd.py | 243 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+) create mode 100755 tools/usb/p9_fwd.py (limited to 'tools') diff --git a/Documentation/filesystems/9p.rst b/Documentation/filesystems/9p.rst index 10cf79dc287f..2cc85f3e8659 100644 --- a/Documentation/filesystems/9p.rst +++ b/Documentation/filesystems/9p.rst @@ -67,6 +67,47 @@ To mount a 9p FS on a USB Host accessible via the gadget as root filesystem:: where <device> is the tag associated by the usb gadget transport. It is defined by the configfs instance name. +USBG Example +============ + +The USB host exports a filesystem, while the gadget on the USB device +side makes it mountable. + +Diod (9pfs server) and the forwarder are on the development host, where +the root filesystem is actually stored. The gadget is initialized during +boot (or later) on the embedded board. Then the forwarder will find it +on the USB bus and start forwarding requests. + +In this case the 9p requests come from the device and are handled by the +host. The reason is that USB device ports are normally not available on +PCs, so a connection in the other direction would not work. + +When using the usbg transport, for now there is no native usb host +service capable of handling the requests from the gadget driver. For +this we have to use the extra python tool p9_fwd.py from tools/usb.
+ Just start the 9pfs capable network server like diod/nfs-ganesha e.g.: + + $ diod -f -n -d 0 -S -l 0.0.0.0:9999 -e $PWD + +Optionally scan your bus to find the gadget paths if there is more than one usbg gadget: + + $ python $kernel_dir/tools/usb/p9_fwd.py list + + Bus | Addr | Manufacturer | Product | ID | Path + --- | ---- | ---------------- | ---------------- | --------- | ---- + 2 | 67 | unknown | unknown | 1d6b:0109 | 2-1.1.2 + 2 | 68 | unknown | unknown | 1d6b:0109 | 2-1.1.3 + +Then start the python transport: + + $ python $kernel_dir/tools/usb/p9_fwd.py --path 2-1.1.2 connect -p 9999 + +After that the gadget driver can be used as described above. + +One use-case is to use it as an alternative to NFS root booting during +the development of embedded Linux devices. + Options ======= diff --git a/tools/usb/p9_fwd.py b/tools/usb/p9_fwd.py new file mode 100755 index 000000000000..12c76cbb046b --- /dev/null +++ b/tools/usb/p9_fwd.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import argparse +import errno +import logging +import socket +import struct +import time + +import usb.core +import usb.util + + +def path_from_usb_dev(dev): + """Takes a pyUSB device as argument and returns a string. + The string is a Path representation of the position of the USB device on the USB bus tree. + + This path is used to find a USB device on the bus or all devices connected to a HUB. + The path is made up of the number of the USB controller followed by the ports of the HUB tree.""" + if dev.port_numbers: + dev_path = ".".join(str(i) for i in dev.port_numbers) + return f"{dev.bus}-{dev_path}" + return "" + + +HEXDUMP_FILTER = "".join(chr(x).isprintable() and chr(x) or "." for x in range(128)) + "." * 128 + + +class Forwarder: + @staticmethod + def _log_hexdump(data): + if not logging.root.isEnabledFor(logging.TRACE): + return + L = 16 + for c in range(0, len(data), L): + chars = data[c : c + L] + dump = " ".join(f"{x:02x}" for x in chars) + printable = "".join(HEXDUMP_FILTER[x] for x in chars) + line = f"{c:08x} {dump:{L*3}s} |{printable:{L}s}|" + logging.root.log(logging.TRACE, "%s", line) + + def __init__(self, server, vid, pid, path): + self.stats = { + "c2s packets": 0, + "c2s bytes": 0, + "s2c packets": 0, + "s2c bytes": 0, + } + self.stats_logged = time.monotonic() + + def find_filter(dev): + dev_path = path_from_usb_dev(dev) + if path is not None: + return dev_path == path + return True + + dev = usb.core.find(idVendor=vid, idProduct=pid, custom_match=find_filter) + if dev is None: + raise ValueError("Device not found") + + logging.info(f"found device: {dev.bus}/{dev.address} located at {path_from_usb_dev(dev)}") + + # dev.set_configuration() is not necessary since g_multi has only one + usb9pfs = None + # g_multi adds 9pfs as last interface + cfg = dev.get_active_configuration() + for intf in cfg: + # we have to detach the usb-storage driver from multi gadget since + # stall option could be set, which will lead to spontaneous port + # resets and our transfers will run dead + if intf.bInterfaceClass == 0x08: + if dev.is_kernel_driver_active(intf.bInterfaceNumber): + dev.detach_kernel_driver(intf.bInterfaceNumber) + + if intf.bInterfaceClass == 0xFF and intf.bInterfaceSubClass == 0xFF and intf.bInterfaceProtocol == 0x09: + usb9pfs = intf + if usb9pfs is None: + raise ValueError("Interface not found") + + logging.info(f"claiming interface:\n{usb9pfs}") + usb.util.claim_interface(dev, usb9pfs.bInterfaceNumber) + ep_out = usb.util.find_descriptor( + usb9pfs,
custom_match=lambda e: usb.util.endpoint_direction(e.bEndpointAddress) == usb.util.ENDPOINT_OUT, + ) + assert ep_out is not None + ep_in = usb.util.find_descriptor( + usb9pfs, + custom_match=lambda e: usb.util.endpoint_direction(e.bEndpointAddress) == usb.util.ENDPOINT_IN, + ) + assert ep_in is not None + logging.info("interface claimed") + + self.ep_out = ep_out + self.ep_in = ep_in + self.dev = dev + + # create and connect socket + self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.s.connect(server) + + logging.info("connected to server") + + def c2s(self): + """forward a request from the USB client to the TCP server""" + data = None + while data is None: + try: + logging.log(logging.TRACE, "c2s: reading") + data = self.ep_in.read(self.ep_in.wMaxPacketSize) + except usb.core.USBTimeoutError: + logging.log(logging.TRACE, "c2s: reading timed out") + continue + except usb.core.USBError as e: + if e.errno == errno.EIO: + logging.debug("c2s: reading failed with %s, retrying", repr(e)) + time.sleep(0.5) + continue + logging.error("c2s: reading failed with %s, aborting", repr(e)) + raise + size = struct.unpack("<I", data[:4])[0] + while len(data) < size: + data += self.ep_in.read(self.ep_in.wMaxPacketSize) + self._log_hexdump(data) + self.s.sendall(data) + logging.debug("c2s: forwarded %i bytes", size) + self.stats["c2s packets"] += 1 + self.stats["c2s bytes"] += size + + def s2c(self): + """forward a response from the TCP server to the USB client""" + data = self.s.recv(7) + size = struct.unpack("<I", data[:4])[0] + while len(data) < size: + data += self.s.recv(size - len(data)) + self._log_hexdump(data) + while data: + written = self.ep_out.write(data) + assert written > 0 + data = data[written:] + if size % self.ep_out.wMaxPacketSize == 0: + logging.log(logging.TRACE, "sending zero length packet") + self.ep_out.write(b"") + logging.debug("s2c: forwarded %i bytes", size) + self.stats["s2c packets"] += 1 + self.stats["s2c bytes"] += size + + def log_stats(self): + logging.info("statistics:") + for k, v in self.stats.items(): + logging.info(f" {k+':':14s} {v}") + + def log_stats_interval(self, interval=5): + if (time.monotonic() - self.stats_logged) < interval: + return + + self.log_stats() + self.stats_logged = time.monotonic() + + +def try_get_usb_str(dev, name): + try: + with open(f"/sys/bus/usb/devices/{dev.bus}-{dev.address}/{name}") as f: + return f.read().strip() + except FileNotFoundError: + return None + + +def list_usb(args): + vid, pid = [int(x, 16) for x in args.id.split(":", 1)] + + print("Bus | Addr | Manufacturer | Product | ID | Path") + print("--- | ---- | ---------------- | ---------------- | --------- | ----") + for dev in usb.core.find(find_all=True, idVendor=vid, idProduct=pid): + path = path_from_usb_dev(dev) or "" + manufacturer = try_get_usb_str(dev, "manufacturer") or "unknown" + product = try_get_usb_str(dev, "product") or "unknown" + print( + f"{dev.bus:3} | {dev.address:4} | {manufacturer:16} | {product:16} | {dev.idVendor:04x}:{dev.idProduct:04x} | {path:18}" + ) + + +def connect(args): + vid, pid = [int(x, 16) for x in args.id.split(":", 1)] + + f = Forwarder(server=(args.server, args.port), vid=vid, pid=pid, path=args.path) + + try: + while True: + f.c2s() + f.s2c() + f.log_stats_interval() + finally: + f.log_stats() + + +def main(): + parser = argparse.ArgumentParser( + description="Forward 9PFS requests from USB to TCP", + ) + + parser.add_argument("--id", type=str, default="1d6b:0109", help="vid:pid of target device") + parser.add_argument("--path", type=str, required=False, help="path of target device") + parser.add_argument("-v", "--verbose", action="count", default=0) + + subparsers = parser.add_subparsers() + subparsers.required = True + subparsers.dest = "command" + + parser_list = subparsers.add_parser("list", help="List all connected 9p gadgets") + parser_list.set_defaults(func=list_usb) + + parser_connect = subparsers.add_parser( + "connect", help="Forward messages between the usb9pfs gadget and the 9p server" + ) + parser_connect.set_defaults(func=connect) + connect_group =
parser_connect.add_argument_group() + connect_group.required = True + parser_connect.add_argument("-s", "--server", type=str, default="127.0.0.1", help="server hostname") + parser_connect.add_argument("-p", "--port", type=int, default=564, help="server port") + + args = parser.parse_args() + + logging.TRACE = logging.DEBUG - 5 + logging.addLevelName(logging.TRACE, "TRACE") + + if args.verbose >= 2: + level = logging.TRACE + elif args.verbose: + level = logging.DEBUG + else: + level = logging.INFO + logging.basicConfig(level=level, format="%(asctime)-15s %(levelname)-8s %(message)s") + + args.func(args) + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 05c1280a2bcfca187fe7fa90bb240602cf54af0a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 29 Aug 2024 14:33:38 +0200 Subject: netdev_features: convert NETIF_F_NETNS_LOCAL to dev->netns_local "Interface can't change network namespaces" is rather an attribute, not a feature, and it can't be changed via Ethtool. Make it a "cold" private flag instead of a netdev_feature and free one more bit. Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- Documentation/networking/net_cachelines/net_device.rst | 1 + Documentation/networking/netdev-features.rst | 7 ------- Documentation/networking/switchdev.rst | 4 ++-- drivers/net/amt.c | 2 +- drivers/net/bonding/bond_main.c | 6 +++--- drivers/net/ethernet/adi/adin1110.c | 2 +- drivers/net/ethernet/marvell/prestera/prestera_main.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 3 ++- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 5 +++-- drivers/net/ethernet/rocker/rocker_main.c | 3 ++- drivers/net/ethernet/ti/cpsw_new.c | 3 ++- drivers/net/loopback.c | 2 +- drivers/net/net_failover.c | 2 +- drivers/net/team/team_core.c | 6 +++--- drivers/net/vrf.c | 2 +- include/linux/netdev_features.h | 6 ++---- include/linux/netdevice.h | 2 ++ net/batman-adv/soft-interface.c | 3 ++- net/bridge/br_device.c | 5 +++-- net/core/dev.c | 4 ++-- net/ethtool/common.c | 1 - net/hsr/hsr_device.c | 8 ++++---- net/ieee802154/6lowpan/core.c | 2 +- net/ieee802154/core.c | 10 +++++----- net/ipv4/ip_tunnel.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv6/ip6_gre.c | 3 +-- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6mr.c | 2 +- net/ipv6/sit.c | 2 +- net/openvswitch/vport-internal_dev.c | 2 +- net/wireless/core.c | 10 +++++----- tools/testing/selftests/net/forwarding/README | 2 +- 34 files changed, 61 insertions(+), 62 deletions(-) (limited to 'tools') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index e55319277074..e649d8e9556f 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -166,6 +166,7 @@ unsigned:1 wol_enabled unsigned:1 threaded - - napi_poll(napi_enable,dev_set_threaded) unsigned_long:1 see_all_hwtstamp_requests unsigned_long:1 change_proto_down +unsigned_long:1 netns_local struct_list_head net_notifier_list struct_macsec_ops* macsec_ops struct_udp_tunnel_nic_info* udp_tunnel_nic_info diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst index f29d982ebf5d..5014f7cc1398 100644 --- a/Documentation/networking/netdev-features.rst +++ b/Documentation/networking/netdev-features.rst @@ -139,13 +139,6 @@ chained skbs (skb->next/prev list). Features contained in NETIF_F_SOFT_FEATURES are features of networking stack. 
Driver should not change behaviour based on them. - * netns-local device - -NETIF_F_NETNS_LOCAL is set for devices that are not allowed to move between -network namespaces (e.g. loopback). - -Don't use it in drivers. - * VLAN challenged NETIF_F_VLAN_CHALLENGED should be set for devices which can't cope with VLAN diff --git a/Documentation/networking/switchdev.rst b/Documentation/networking/switchdev.rst index 758f1dae3fce..f355f0166f1b 100644 --- a/Documentation/networking/switchdev.rst +++ b/Documentation/networking/switchdev.rst @@ -137,10 +137,10 @@ would be sub-port 0 on port 1 on switch 1. Port Features ^^^^^^^^^^^^^ -NETIF_F_NETNS_LOCAL +dev->netns_local If the switchdev driver (and device) only supports offloading of the default -network namespace (netns), the driver should set this feature flag to prevent +network namespace (netns), the driver should set this private flag to prevent the port netdev from being moved out of the default netns. A netns-aware driver/device would not set this flag and be responsible for partitioning hardware to preserve netns containment. This means hardware cannot forward diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 921bbfd72a38..0433a0f36d1b 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3099,8 +3099,8 @@ static void amt_link_setup(struct net_device *dev) dev->addr_len = 0; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; + dev->netns_local = true; dev->features |= NETIF_F_GSO_SOFTWARE; - dev->features |= NETIF_F_NETNS_LOCAL; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM; dev->hw_features |= NETIF_F_FRAGLIST | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8ae887cdc231..f13d413ad26c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5930,6 +5930,9 @@ void bond_setup(struct net_device *bond_dev) /* don't acquire bond device's netif_tx_lock when transmitting */ bond_dev->lltx = true; + /* Don't allow bond devices to change network namespaces. */ + bond_dev->netns_local = true; + /* By default, we declare the bond to be fully * VLAN hardware accelerated capable. Special * care is taken in the various xmit functions @@ -5937,9 +5940,6 @@ void bond_setup(struct net_device *bond_dev) * capable */ - /* Don't allow bond devices to change network namespaces. 
*/ - bond_dev->features |= NETIF_F_NETNS_LOCAL; - bond_dev->hw_features = BOND_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | diff --git a/drivers/net/ethernet/adi/adin1110.c b/drivers/net/ethernet/adi/adin1110.c index 0713f1e2c7f3..3431a7e62b0d 100644 --- a/drivers/net/ethernet/adi/adin1110.c +++ b/drivers/net/ethernet/adi/adin1110.c @@ -1599,7 +1599,7 @@ static int adin1110_probe_netdevs(struct adin1110_priv *priv) netdev->netdev_ops = &adin1110_netdev_ops; netdev->ethtool_ops = &adin1110_ethtool_ops; netdev->priv_flags |= IFF_UNICAST_FLT; - netdev->features |= NETIF_F_NETNS_LOCAL; + netdev->netns_local = true; port_priv->phydev = get_phy_device(priv->mii_bus, i + 1, false); if (IS_ERR(port_priv->phydev)) { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 63ae01954dfc..22ca6ee9665e 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -633,7 +633,8 @@ static int prestera_port_create(struct prestera_switch *sw, u32 id) if (err) goto err_dl_port_register; - dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_HW_TC; + dev->features |= NETIF_F_HW_TC; + dev->netns_local = true; dev->netdev_ops = &prestera_netdev_ops; dev->ethtool_ops = &prestera_ethtool_ops; SET_NETDEV_DEV(dev, sw->dev->dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 16b67c457b60..47e7a80d221b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4414,9 +4414,9 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev, if (mlx5e_is_uplink_rep(priv)) { features = mlx5e_fix_uplink_rep_features(netdev, features); - features |= NETIF_F_NETNS_LOCAL; + netdev->netns_local = true; } else { - features &= ~NETIF_F_NETNS_LOCAL; + netdev->netns_local = false; } mutex_unlock(&priv->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index b885042eef14..92094bf60d59 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -898,7 +898,8 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev, netdev->hw_features |= NETIF_F_RXCSUM; netdev->features |= netdev->hw_features; - netdev->features |= NETIF_F_NETNS_LOCAL; + + netdev->netns_local = true; } static int mlx5e_init_rep(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 44d6e125bd6f..b9ffd7236aff 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1676,10 +1676,11 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u16 local_port, netif_carrier_off(dev); - dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_SG | - NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC; + dev->features |= NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_FILTER | + NETIF_F_HW_TC; dev->hw_features |= NETIF_F_HW_TC | NETIF_F_LOOPBACK; dev->lltx = true; + dev->netns_local = true; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = MLXSW_PORT_MAX_MTU - MLXSW_PORT_ETH_FRAME_HDR; diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index e097ce3e69ea..84fa911c78db 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ 
-2575,7 +2575,8 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) netif_napi_add(dev, &rocker_port->napi_rx, rocker_port_poll_rx); rocker_carrier_init(rocker_port); - dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_SG; + dev->features |= NETIF_F_SG; + dev->netns_local = true; /* MTU range: 68 - 9000 */ dev->min_mtu = ROCKER_PORT_MIN_MTU; diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 2baa198ebfa0..557cc71b9dd2 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1407,7 +1407,8 @@ static int cpsw_create_ports(struct cpsw_common *cpsw) cpsw->slaves[i].ndev = ndev; ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | - NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_NETNS_LOCAL | NETIF_F_HW_TC; + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_TC; + ndev->netns_local = true; ndev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index bf857782be0f..1993b90b1a5f 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -172,6 +172,7 @@ static void gen_lo_setup(struct net_device *dev, dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->lltx = true; + dev->netns_local = true; netif_keep_dst(dev); dev->hw_features = NETIF_F_GSO_SOFTWARE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST @@ -180,7 +181,6 @@ static void gen_lo_setup(struct net_device *dev, | NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA - | NETIF_F_NETNS_LOCAL | NETIF_F_VLAN_CHALLENGED | NETIF_F_LOOPBACK; dev->ethtool_ops = eth_ops; diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 06728385a35f..54c8b9d5b5fc 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -734,7 +734,7 @@ struct failover *net_failover_create(struct net_device *standby_dev) failover_dev->lltx = true; /* Don't allow failover devices to change network namespaces. */ - failover_dev->features |= NETIF_F_NETNS_LOCAL; + failover_dev->netns_local = true; failover_dev->hw_features = FAILOVER_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 1d1bad3cedc2..18191d5a8bd4 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -2191,10 +2191,10 @@ static void team_setup(struct net_device *dev) dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; dev->lltx = true; - dev->features |= NETIF_F_GRO; - /* Don't allow team devices to change network namespaces. */ - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; + + dev->features |= NETIF_F_GRO; dev->hw_features = TEAM_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index b1d4ef538995..4d8ccaf9a2b4 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1638,7 +1638,7 @@ static void vrf_setup(struct net_device *dev) dev->lltx = true; /* don't allow vrf devices to change network namespaces. */ - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index a2e20b517584..d5a3836f4793 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -25,7 +25,7 @@ enum { NETIF_F_VLAN_CHALLENGED_BIT, /* Device cannot handle VLAN packets */ NETIF_F_GSO_BIT, /* Enable software GSO. 
*/ __UNUSED_NETIF_F_12, - NETIF_F_NETNS_LOCAL_BIT, /* Does not change network namespaces */ + __UNUSED_NETIF_F_13, NETIF_F_GRO_BIT, /* Generic receive offload */ NETIF_F_LRO_BIT, /* large receive offload */ @@ -121,7 +121,6 @@ enum { #define NETIF_F_IPV6_CSUM __NETIF_F(IPV6_CSUM) #define NETIF_F_LOOPBACK __NETIF_F(LOOPBACK) #define NETIF_F_LRO __NETIF_F(LRO) -#define NETIF_F_NETNS_LOCAL __NETIF_F(NETNS_LOCAL) #define NETIF_F_NOCACHE_COPY __NETIF_F(NOCACHE_COPY) #define NETIF_F_NTUPLE __NETIF_F(NTUPLE) #define NETIF_F_RXCSUM __NETIF_F(RXCSUM) @@ -190,8 +189,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ -#define NETIF_F_NEVER_CHANGE (NETIF_F_VLAN_CHALLENGED | \ - NETIF_F_NETNS_LOCAL) +#define NETIF_F_NEVER_CHANGE NETIF_F_VLAN_CHALLENGED /* remember that ((t)1 << t_BITS) is undefined in C99 */ #define NETIF_F_ETHTOOL_BITS ((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e22ca5e07490..a698e2402420 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1968,6 +1968,7 @@ enum netdev_reg_state { * regardless of source, even if those aren't * HWTSTAMP_SOURCE_NETDEV * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN + * @netns_local: interface can't change network namespaces * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved @@ -2361,6 +2362,7 @@ struct net_device { /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; unsigned long change_proto_down:1; + unsigned long netns_local:1; struct list_head net_notifier_list; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index e791a73ef901..2758aba47a2f 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1020,9 +1020,10 @@ static void batadv_softif_init_early(struct net_device *dev) dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; - dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; + dev->netns_local = true; /* can't call min_mtu, because the needed variables * have not been initialized yet diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index a6d25113dfb1..26b79feb385d 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -488,9 +488,10 @@ void br_dev_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &br_type); dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; dev->lltx = true; + dev->netns_local = true; - dev->features = COMMON_FEATURES | NETIF_F_NETNS_LOCAL | - NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; + dev->features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->vlan_features = COMMON_FEATURES; diff --git a/net/core/dev.c b/net/core/dev.c index 56aed88ceadf..98bb5f890b88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11487,7 +11487,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, /* Don't allow namespace local devices to be moved. 
*/ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) goto out; /* Ensure the device has been registered */ @@ -11869,7 +11869,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) continue; /* Leave virtual devices for the generic cleanup */ diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 447fc80e1b00..ca8e64162104 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -25,7 +25,6 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", [NETIF_F_LRO_BIT] = "rx-lro", diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index d4c783076662..a06e790042e2 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -556,6 +556,10 @@ void hsr_dev_setup(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; /* Prevent recursive tx locking */ dev->lltx = true; + /* Not sure about this. Taken from bridge code. netdevice.h says + * it means "Does not change network namespaces". + */ + dev->netns_local = true; dev->needs_free_netdev = true; @@ -569,10 +573,6 @@ void hsr_dev_setup(struct net_device *dev) * hsr_header_create() etc. */ dev->features |= NETIF_F_VLAN_CHALLENGED; - /* Not sure about this. Taken from bridge code. netdev_features.h says - * it means "Does not change network namespaces". - */ - dev->features |= NETIF_F_NETNS_LOCAL; } /* Return true if dev is a HSR master; return false otherwise. 
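Editor's note: every driver hunk in this patch follows the same two-line pattern. A minimal before/after sketch (example_setup_old/_new() are hypothetical setup callbacks, not any of the drivers above; the "before" half only compiles against a pre-patch tree, since the feature bit is removed):

	#include <linux/netdevice.h>

	/* Before: a netdev feature bit, occupying one of the limited
	 * NETIF_F_* bits even though ethtool could never toggle it.
	 */
	static void example_setup_old(struct net_device *dev)
	{
		dev->features |= NETIF_F_NETNS_LOCAL;
	}

	/* After: a 1-bit "cold" flag stored next to change_proto_down
	 * and see_all_hwtstamp_requests in struct net_device.
	 */
	static void example_setup_new(struct net_device *dev)
	{
		dev->netns_local = true;
	}

As the net/core/dev.c hunk above shows, __dev_change_net_namespace() then tests dev->netns_local instead of the feature bit and fails with -EINVAL when it is set.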
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 77b4e92027c5..175efd860f7b 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -116,7 +116,7 @@ static void lowpan_setup(struct net_device *ldev) ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; - ldev->features |= NETIF_F_NETNS_LOCAL; + ldev->netns_local = true; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 60e8fff1347e..88adb04e4072 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -226,11 +226,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; - wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = true; } if (err) { @@ -242,11 +242,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); WARN_ON(err); - wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = true; } return err; @@ -291,7 +291,7 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb, switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0cd2f3de100c..18964394d6bd 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1161,7 +1161,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, * Allowing to move it to another netns is clearly unsafe. */ if (!IS_ERR(itn->fb_tunnel_dev)) { - itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->netns_local = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b0eda745e3bc..f1a43199551b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -537,7 +537,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; } static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 08beab638bda..235808cfec70 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1621,8 +1621,7 @@ static int __net_init ip6gre_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
*/ - ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - + ign->fb_tunnel_dev->netns_local = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index cbef0fcece71..ec51ab5063e8 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2258,7 +2258,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; + ip6n->fb_tnl_dev->netns_local = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e3ee93c562c3..2ce4ae0d8dc3 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -640,7 +640,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 58306522fb1e..16b90a24c9ba 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1856,7 +1856,7 @@ static int __net_init sit_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + sitn->fb_tunnel_dev->netns_local = true; err = register_netdev(sitn->fb_tunnel_dev); if (err) diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 3a369a31c5cc..5858d65ea1a9 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -149,7 +149,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) /* Restrict bridge port to current netns. 
*/ if (vport->port_no == OVSP_LOCAL) - vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->netns_local = true; rtnl_lock(); err = register_netdevice(vport->dev);

diff --git a/net/wireless/core.c b/net/wireless/core.c index 4d5d351bd0b5..661adfc77644 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -165,11 +165,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; - wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wdev->netdev->netns_local = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); if (err) break; - wdev->netdev->features |= NETIF_F_NETNS_LOCAL; + wdev->netdev->netns_local = true; } if (err) { @@ -181,11 +181,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, list) { if (!wdev->netdev) continue; - wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wdev->netdev->netns_local = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); WARN_ON(err); - wdev->netdev->features |= NETIF_F_NETNS_LOCAL; + wdev->netdev->netns_local = true; } return err; @@ -1473,7 +1473,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, SET_NETDEV_DEVTYPE(dev, &wiphy_type); wdev->netdev = dev; /* can only change netns with wiphy */ - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; cfg80211_init_wdev(wdev); break;

diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README index 7fdb6a9ca543..a652429bfd53 100644 --- a/tools/testing/selftests/net/forwarding/README +++ b/tools/testing/selftests/net/forwarding/README @@ -6,7 +6,7 @@ to easily create and test complex environments. Unfortunately, these namespaces can not be used with actual switching ASICs, as their ports can not be migrated to other network namespaces -(NETIF_F_NETNS_LOCAL) and most of them probably do not support the +(dev->netns_local) and most of them probably do not support the L1-separation provided by namespaces. However, a similar kind of flexibility can be achieved by using VRFs and -- cgit v1.2.3

From 5ceb87dc76ab269c940541ad9487cf0f3c0c793d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 30 Aug 2024 11:22:39 +0200 Subject: selftests: netfilter: nft_queue.sh: fix spurious timeout on debug kernel

The sctp selftest is very slow on debug kernels. It's possible that the nf_queue listener program exits due to a timeout before the first SCTP packet is processed. In this case socat hangs until the script times out.

Fix this by removing the -t option where possible and killing the test program once the file transfer/socat has exited.

-t sets SO_RCVTIMEO; it's intended for the 'ping' part of the selftest, where we want to make sure that packets get reinjected properly without skipping a second queue request.

While at it, add a helper to compare the (binary) files instead of diff. The 'diff' part was copied from another sub-test that compares text. Let the helper dump file sizes on error so we can see the progress made.

Tested on an old 2010-ish box with a debug kernel and 100 iterations. This is a follow-up to the earlier file size reduction change.
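Editor's note: for readers unfamiliar with the -t knob discussed above, it translates into a plain receive timeout on the test program's socket. Roughly what that amounts to (a generic sketch, not the actual nf_queue source):

	#include <sys/socket.h>
	#include <sys/time.h>

	/* Make recv() give up after "secs" seconds of silence. On a slow
	 * debug kernel the first SCTP packet may take longer than that to
	 * arrive, which is how the listener could exit prematurely.
	 */
	static int set_recv_timeout(int fd, time_t secs)
	{
		struct timeval tv = { .tv_sec = secs, .tv_usec = 0 };

		return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
	}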
Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240829080109.GB30766@breakpoint.cc/ Fixes: 0a8b08c554da ("selftests: netfilter: nft_queue.sh: reduce test file size for debug build") Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20240830092254.8029-1-fw@strlen.de Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/netfilter/nft_queue.sh | 62 ++++++++++++++-------- 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index 9e5f423bff09..d66e3c4dfec6 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -8,7 +8,7 @@ source lib.sh ret=0 -timeout=2 +timeout=5 cleanup() { @@ -255,17 +255,19 @@ listener_ready() test_tcp_forward() { - ip netns exec "$nsrouter" ./nf_queue -q 2 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 2 & local nfqpid=$! timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 2 ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain" + kill "$nfqpid" } test_tcp_localhost() @@ -273,26 +275,29 @@ test_tcp_localhost() timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! - ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null wait "$rpid" && echo "PASS: tcp via loopback" - wait 2>/dev/null + kill "$nfqpid" } test_tcp_localhost_connectclose() { - ip netns exec "$nsrouter" ./connect_close -p 23456 -t "$timeout" & - ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 & + local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 + timeout 10 ip netns exec "$nsrouter" ./connect_close -p 23456 -t 3 + + kill "$nfqpid" wait && echo "PASS: tcp via loopback with connect/close" - wait 2>/dev/null } test_tcp_localhost_requeue() @@ -357,7 +362,7 @@ table inet filter { } } EOF - ip netns exec "$ns1" ./nf_queue -q 1 -t "$timeout" & + ip netns exec "$ns1" ./nf_queue -q 1 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 1 @@ -367,6 +372,7 @@ EOF for n in output post; do for d in tvrf eth0; do if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then + kill "$nfqpid" echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 ip netns exec "$ns1" nft list ruleset ret=1 @@ -375,8 +381,8 @@ EOF done done - wait "$nfqpid" && echo "PASS: icmp+nfqueue via vrf" - wait 2>/dev/null + kill "$nfqpid" + echo "PASS: icmp+nfqueue via vrf" } sctp_listener_ready() @@ -384,6 +390,22 @@ sctp_listener_ready() ss -S -N "$1" -ln -o "sport = :12345" | grep -q 12345 } +check_output_files() +{ + local f1="$1" + local f2="$2" + local err="$3" + + if ! 
cmp "$f1" "$f2" ; then + echo "FAIL: $err: input and output file differ" 1>&2 + echo -n " Input file" 1>&2 + ls -l "$f1" 1>&2 + echo -n "Output file" 1>&2 + ls -l "$f2" 1>&2 + ret=1 + fi +} + test_sctp_forward() { ip netns exec "$nsrouter" nft -f /dev/stdin </dev/null @@ -411,11 +433,9 @@ EOF fi wait "$rpid" && echo "PASS: sctp and nfqueue in forward chain" + kill "$nfqpid" - if ! diff -u "$TMPINPUT" "$TMPFILE1" ; then - echo "FAIL: lost packets?!" 1>&2 - exit 1 - fi + check_output_files "$TMPINPUT" "$TMPFILE1" "sctp forward" } test_sctp_output() @@ -429,14 +449,14 @@ table inet sctpq { } EOF # reduce test file size, software segmentation causes sk wmem increase. - dd conv=sparse status=none if=/dev/zero bs=1M count=50 of="$TMPINPUT" + dd conv=sparse status=none if=/dev/zero bs=1M count=$((COUNT/2)) of="$TMPINPUT" timeout 60 ip netns exec "$ns2" socat -u SCTP-LISTEN:12345 STDOUT > "$TMPFILE1" & local rpid=$! busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2" - ip netns exec "$ns1" ./nf_queue -q 11 -t "$timeout" & + ip netns exec "$ns1" ./nf_queue -q 11 & local nfqpid=$! ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null @@ -448,11 +468,9 @@ EOF # must wait before checking completeness of output file. wait "$rpid" && echo "PASS: sctp and nfqueue in output chain with GSO" + kill "$nfqpid" - if ! diff -u "$TMPINPUT" "$TMPFILE1" ; then - echo "FAIL: lost packets?!" 1>&2 - exit 1 - fi + check_output_files "$TMPINPUT" "$TMPFILE1" "sctp output" } test_queue_removal() @@ -468,7 +486,7 @@ table ip filter { } } EOF - ip netns exec "$ns1" ./nf_queue -q 0 -d 30000 -t "$timeout" & + ip netns exec "$ns1" ./nf_queue -q 0 -d 30000 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 0 -- cgit v1.2.3 From c3f8644c21df9b7db97eb70e08e2826368aaafa0 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sat, 3 Aug 2024 17:20:06 +0200 Subject: perf report: Support LLVM for addr2line() In addition to the existing support for libbfd and calling out to an external addr2line command, add support for using libllvm directly. This is both faster than libbfd, and can be enabled in distro builds (the LLVM license has an explicit provision for GPLv2 compatibility). Thus, it is set as the primary choice if available. As an example, running 'perf report' on a medium-size profile with DWARF-based backtraces took 58 seconds with LLVM, 78 seconds with libbfd, 153 seconds with external llvm-addr2line, and I got tired and aborted the test after waiting for 55 minutes with external bfd addr2line (which is the default for perf as compiled by distributions today). Evidently, for this case, the bfd addr2line process needs 18 seconds (on a 5.2 GHz Zen 3) to load the .debug ELF in question, hits the 1-second timeout and gets killed during initialization, getting restarted anew every time. Having an in-process addr2line makes this much more robust. As future extensions, libllvm can be used in many other places where we currently use libbfd or other libraries: - Symbol enumeration (in particular, for PE binaries). - Demangling (including non-Itanium demangling, e.g. Microsoft or Rust). - Disassembling (perf annotate). However, these are much less pressing; most people don't profile PE binaries, and perf has non-bfd paths for ELF. The same with demangling; the default _cxa_demangle path works fine for most users, and while bfd objdump can be slow on large binaries, it is possible to use --objdump=llvm-objdump to get the speed benefits. 
(It appears LLVM-based demangling is very simple, should we want that.) Tested with LLVM 14, 15, 16, 18 and 19. For some reason, LLVM 12 was not correctly detected using feature_check, and thus was not tested. Committer notes: Added the name and a __maybe_unused to address: 1 13.50 almalinux:8 : FAIL gcc version 8.5.0 20210514 (Red Hat 8.5.0-22) (GCC) util/srcline.c: In function 'dso__free_a2l': util/srcline.c:184:20: error: parameter name omitted void dso__free_a2l(struct dso *) ^~~~~~~~~~~~ make[3]: *** [/git/perf-6.11.0-rc3/tools/build/Makefile.build:158: util] Error 2 Signed-off-by: Steinar H. Gunderson Tested-by: Arnaldo Carvalho de Melo Cc: Ian Rogers Link: https://lore.kernel.org/r/20240803152008.2818485-1-sesse@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 1 + tools/perf/Makefile.config | 17 +++++ tools/perf/builtin-version.c | 1 + tools/perf/tests/make | 2 + tools/perf/util/Build | 1 + tools/perf/util/llvm-c-helpers.cpp | 134 +++++++++++++++++++++++++++++++++++++ tools/perf/util/llvm-c-helpers.h | 49 ++++++++++++++ tools/perf/util/srcline.c | 59 +++++++++++++++- 8 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 tools/perf/util/llvm-c-helpers.cpp create mode 100644 tools/perf/util/llvm-c-helpers.h (limited to 'tools') diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index e1900abd44f6..0717e96d6a0e 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -136,6 +136,7 @@ FEATURE_DISPLAY ?= \ libunwind \ libdw-dwarf-unwind \ libcapstone \ + llvm \ zlib \ lzma \ get_cpuid \ diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 9998cd7a8792..7888c932b1b4 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -980,6 +980,23 @@ ifdef BUILD_NONDISTRO endif endif +ifndef NO_LIBLLVM + $(call feature_check,llvm) + ifeq ($(feature-llvm), 1) + CFLAGS += -DHAVE_LIBLLVM_SUPPORT + CFLAGS += $(shell $(LLVM_CONFIG) --cflags) + CXXFLAGS += -DHAVE_LIBLLVM_SUPPORT + CXXFLAGS += $(shell $(LLVM_CONFIG) --cxxflags) + LIBLLVM = $(shell $(LLVM_CONFIG) --libs all) $(shell $(LLVM_CONFIG) --system-libs) + EXTLIBS += -L$(shell $(LLVM_CONFIG) --libdir) $(LIBLLVM) + EXTLIBS += -lstdc++ + $(call detected,CONFIG_LIBLLVM) + else + $(warning No libllvm found, slower source file resolution, please install llvm-devel/llvm-dev) + NO_LIBLLVM := 1 + endif +endif + ifndef NO_DEMANGLE $(call feature_check,cxa-demangle) ifeq ($(feature-cxa-demangle), 1) diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c index 398aa53e9e2e..4b252196de12 100644 --- a/tools/perf/builtin-version.c +++ b/tools/perf/builtin-version.c @@ -65,6 +65,7 @@ static void library_status(void) STATUS(HAVE_LIBBFD_SUPPORT, libbfd); STATUS(HAVE_DEBUGINFOD_SUPPORT, debuginfod); STATUS(HAVE_LIBELF_SUPPORT, libelf); + STATUS(HAVE_LIBLLVM_SUPPORT, libllvm); STATUS(HAVE_LIBNUMA_SUPPORT, libnuma); STATUS(HAVE_LIBNUMA_SUPPORT, numa_num_possible_cpus); STATUS(HAVE_LIBPERL_SUPPORT, libperl); diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 8a9edf758f10..a5040772043f 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -93,6 +93,7 @@ make_no_libbpf := NO_LIBBPF=1 make_libbpf_dynamic := LIBBPF_DYNAMIC=1 make_no_libbpf_DEBUG := NO_LIBBPF=1 DEBUG=1 make_no_libcrypto := NO_LIBCRYPTO=1 +make_no_libllvm := NO_LIBLLVM=1 make_with_babeltrace:= LIBBABELTRACE=1 make_with_coresight := CORESIGHT=1 make_no_sdt := NO_SDT=1 @@ -163,6 +164,7 @@ run += make_no_auxtrace run += 
make_no_libbpf run += make_no_libbpf_DEBUG run += make_no_libcrypto +run += make_no_libllvm run += make_no_sdt run += make_no_syscall_tbl run += make_with_babeltrace diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 260cec2f6c0b..dc616292b2dd 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -229,6 +229,7 @@ perf-util-$(CONFIG_CXX_DEMANGLE) += demangle-cxx.o perf-util-y += demangle-ocaml.o perf-util-y += demangle-java.o perf-util-y += demangle-rust.o +perf-util-$(CONFIG_LIBLLVM) += llvm-c-helpers.o ifdef CONFIG_JITDUMP perf-util-$(CONFIG_LIBELF) += jitdump.o diff --git a/tools/perf/util/llvm-c-helpers.cpp b/tools/perf/util/llvm-c-helpers.cpp new file mode 100644 index 000000000000..3cc967ec6f28 --- /dev/null +++ b/tools/perf/util/llvm-c-helpers.cpp @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Must come before the linux/compiler.h include, which defines several + * macros (e.g. noinline) that conflict with compiler builtins used + * by LLVM. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" /* Needed for LLVM <= 15 */ +#include +#pragma GCC diagnostic pop + +#include +#include +#include +extern "C" { +#include +} +#include "symbol_conf.h" +#include "llvm-c-helpers.h" + +using namespace llvm; +using llvm::symbolize::LLVMSymbolizer; + +/* + * Allocate a static LLVMSymbolizer, which will live to the end of the program. + * Unlike the bfd paths, LLVMSymbolizer has its own cache, so we do not need + * to store anything in the dso struct. + */ +static LLVMSymbolizer *get_symbolizer() +{ + static LLVMSymbolizer *instance = nullptr; + if (instance == nullptr) { + LLVMSymbolizer::Options opts; + /* + * LLVM sometimes demangles slightly different from the rest + * of the code, and this mismatch can cause new_inline_sym() + * to get confused and mark non-inline symbol as inlined + * (since the name does not properly match up with base_sym). + * Thus, disable the demangling and let the rest of the code + * handle it. + */ + opts.Demangle = false; + instance = new LLVMSymbolizer(opts); + } + return instance; +} + +/* Returns 0 on error, 1 on success. */ +static int extract_file_and_line(const DILineInfo &line_info, char **file, + unsigned int *line) +{ + if (file) { + if (line_info.FileName == "") { + /* Match the convention of libbfd. */ + *file = nullptr; + } else { + /* The caller expects to get something it can free(). 
*/ + *file = strdup(line_info.FileName.c_str()); + if (*file == nullptr) + return 0; + } + } + if (line) + *line = line_info.Line; + return 1; +} + +extern "C" +int llvm_addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line, + bool unwind_inlines, + llvm_a2l_frame **inline_frames) +{ + LLVMSymbolizer *symbolizer = get_symbolizer(); + object::SectionedAddress sectioned_addr = { + addr, + object::SectionedAddress::UndefSection + }; + + if (unwind_inlines) { + Expected res_or_err = + symbolizer->symbolizeInlinedCode(dso_name, + sectioned_addr); + if (!res_or_err) + return 0; + unsigned num_frames = res_or_err->getNumberOfFrames(); + if (num_frames == 0) + return 0; + + if (extract_file_and_line(res_or_err->getFrame(0), + file, line) == 0) + return 0; + + *inline_frames = (llvm_a2l_frame *)calloc( + num_frames, sizeof(**inline_frames)); + if (*inline_frames == nullptr) + return 0; + + for (unsigned i = 0; i < num_frames; ++i) { + const DILineInfo &src = res_or_err->getFrame(i); + + llvm_a2l_frame &dst = (*inline_frames)[i]; + if (src.FileName == "") + /* Match the convention of libbfd. */ + dst.filename = nullptr; + else + dst.filename = strdup(src.FileName.c_str()); + dst.funcname = strdup(src.FunctionName.c_str()); + dst.line = src.Line; + + if (dst.filename == nullptr || + dst.funcname == nullptr) { + for (unsigned j = 0; j <= i; ++j) { + zfree(&(*inline_frames)[j].filename); + zfree(&(*inline_frames)[j].funcname); + } + zfree(inline_frames); + return 0; + } + } + + return num_frames; + } else { + if (inline_frames) + *inline_frames = nullptr; + + Expected res_or_err = + symbolizer->symbolizeCode(dso_name, sectioned_addr); + if (!res_or_err) + return 0; + return extract_file_and_line(*res_or_err, file, line); + } +} diff --git a/tools/perf/util/llvm-c-helpers.h b/tools/perf/util/llvm-c-helpers.h new file mode 100644 index 000000000000..19332dd98e14 --- /dev/null +++ b/tools/perf/util/llvm-c-helpers.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_LLVM_C_HELPERS +#define __PERF_LLVM_C_HELPERS 1 + +/* + * Helpers to call into LLVM C++ code from C, for the parts that do not have + * C APIs. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct llvm_a2l_frame { + char* filename; + char* funcname; + unsigned int line; +}; + +/* + * Implement addr2line() using libLLVM. LLVM is a C++ API, and + * many of the linux/ headers cannot be included in a C++ compile unit, + * so we need to make a little bridge code here. llvm_addr2line() will + * convert the inline frame information from LLVM's internal structures + * and put them into a flat array given in inline_frames. The caller + * is then responsible for taking that array and convert it into perf's + * regular inline frame structures (which depend on e.g. struct list_head). + * + * If the address could not be resolved, or an error occurred (e.g. OOM), + * returns 0. Otherwise, returns the number of inline frames (which means 1 + * if the address was not part of an inlined function). If unwind_inlines + * is set and the return code is nonzero, inline_frames will be set to + * a newly allocated array with that length. The caller is then responsible + * for freeing both the strings and the array itself. 
+ */ +int llvm_addr2line(const char* dso_name, + u64 addr, + char** file, + unsigned int* line, + bool unwind_inlines, + struct llvm_a2l_frame** inline_frames); + +#ifdef __cplusplus +} +#endif + +#endif /* __PERF_LLVM_C_HELPERS */ diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 760742fd4a7d..f32d0d4f4bc9 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -16,6 +17,9 @@ #include "util/debug.h" #include "util/callchain.h" #include "util/symbol_conf.h" +#ifdef HAVE_LIBLLVM_SUPPORT +#include "util/llvm-c-helpers.h" +#endif #include "srcline.h" #include "string2.h" #include "symbol.h" @@ -130,7 +134,60 @@ static struct symbol *new_inline_sym(struct dso *dso, #define MAX_INLINE_NEST 1024 -#ifdef HAVE_LIBBFD_SUPPORT +#ifdef HAVE_LIBLLVM_SUPPORT + +static void free_llvm_inline_frames(struct llvm_a2l_frame *inline_frames, + int num_frames) +{ + if (inline_frames != NULL) { + for (int i = 0; i < num_frames; ++i) { + zfree(&inline_frames[i].filename); + zfree(&inline_frames[i].funcname); + } + zfree(&inline_frames); + } +} + +static int addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line, struct dso *dso, + bool unwind_inlines, struct inline_node *node, + struct symbol *sym) +{ + struct llvm_a2l_frame *inline_frames = NULL; + int num_frames = llvm_addr2line(dso_name, addr, file, line, + node && unwind_inlines, &inline_frames); + + if (num_frames == 0 || !inline_frames) { + /* Error, or we didn't want inlines. */ + return num_frames; + } + + for (int i = 0; i < num_frames; ++i) { + struct symbol *inline_sym = + new_inline_sym(dso, sym, inline_frames[i].funcname); + char *srcline = NULL; + + if (inline_frames[i].filename) { + srcline = + srcline_from_fileline(inline_frames[i].filename, + inline_frames[i].line); + } + if (inline_list__append(inline_sym, srcline, node) != 0) { + free_llvm_inline_frames(inline_frames, num_frames); + return 0; + } + } + free_llvm_inline_frames(inline_frames, num_frames); + + return num_frames; +} + +void dso__free_a2l(struct dso *dso __maybe_unused) +{ + /* Nothing to free. */ +} + +#elif defined(HAVE_LIBBFD_SUPPORT) /* * Implement addr2line using libbfd. -- cgit v1.2.3 From 6eca7c5ac23effd552d6f03acc5ce0efc1ed1c1e Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sat, 3 Aug 2024 17:20:07 +0200 Subject: perf annotate: Split out read_symbol() The Capstone disassembler code has a useful code snippet to read the bytes for a given code symbol into memory. Split it out into its own function, so that the LLVM disassembler can use it in the next patch. Signed-off-by: Steinar H. 
Gunderson Cc: Ian Rogers Link: https://lore.kernel.org/r/20240803152008.2818485-2-sesse@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 90 ++++++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index d11e75133b23..5e9be3487c6c 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1372,6 +1372,53 @@ static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg) return 0; } +static u8 * +read_symbol(const char *filename, struct map *map, struct symbol *sym, + u64 *len, bool *is_64bit) +{ + struct dso *dso = map__dso(map); + struct nscookie nsc; + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + int fd, count; + u8 *buf = NULL; + struct find_file_offset_data data = { + .ip = start, + }; + + *is_64bit = false; + + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + fd = open(filename, O_RDONLY); + nsinfo__mountns_exit(&nsc); + if (fd < 0) + return NULL; + + if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, + is_64bit) == 0) + goto err; + + *len = end - start; + buf = malloc(*len); + if (buf == NULL) + goto err; + + count = pread(fd, buf, *len, data.offset); + close(fd); + fd = -1; + + if ((u64)count != *len) + goto err; + + return buf; + +err: + if (fd >= 0) + close(fd); + free(buf); + return NULL; +} + static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, struct annotate_args *args, u64 addr) { @@ -1572,19 +1619,13 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, { struct annotation *notes = symbol__annotation(sym); struct map *map = args->ms.map; - struct dso *dso = map__dso(map); - struct nscookie nsc; u64 start = map__rip_2objdump(map, sym->start); - u64 end = map__rip_2objdump(map, sym->end); - u64 len = end - start; + u64 len; u64 offset; - int i, fd, count; + int i, count; bool is_64bit = false; bool needs_cs_close = false; u8 *buf = NULL; - struct find_file_offset_data data = { - .ip = start, - }; csh handle; cs_insn *insn; char disasm_buf[512]; @@ -1593,31 +1634,9 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, if (args->options->objdump_path) return -1; - nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); - fd = open(filename, O_RDONLY); - nsinfo__mountns_exit(&nsc); - if (fd < 0) - return -1; - - if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, - &is_64bit) == 0) - goto err; - - if (open_capstone_handle(args, is_64bit, &handle) < 0) - goto err; - - needs_cs_close = true; - - buf = malloc(len); + buf = read_symbol(filename, map, sym, &len, &is_64bit); if (buf == NULL) - goto err; - - count = pread(fd, buf, len, data.offset); - close(fd); - fd = -1; - - if ((u64)count != len) - goto err; + return -1; /* add the function address and name */ scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", @@ -1635,6 +1654,11 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, annotation_line__add(&dl->al, ¬es->src->source); + if (open_capstone_handle(args, is_64bit, &handle) < 0) + goto err; + + needs_cs_close = true; + count = cs_disasm(handle, buf, len, start, len, &insn); for (i = 0, offset = 0; i < count; i++) { int printed; @@ -1679,8 +1703,6 @@ out: return count < 0 ? 
count : 0; err: - if (fd >= 0) - close(fd); if (needs_cs_close) { struct disasm_line *tmp; -- cgit v1.2.3 From 04885681788855bb266fd131f06c04952a717659 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sat, 3 Aug 2024 17:20:08 +0200 Subject: perf annotate: LLVM-based disassembler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support using LLVM as a disassembler method, allowing helperless annotation in non-distro builds. (It is also much faster than using libbfd or bfd objdump on binaries with a lot of debug information.) This is nearly identical to the output of llvm-objdump; there are some very rare whitespace differences, some minor changes to demangling (since we use perf's regular demangling and not LLVM's own) and the occasional case where llvm-objdump makes a different choice when multiple symbols share the same address. It should work across all of LLVM's supported architectures, although I've only tested 64-bit x86, and finding the right triple from perf's idea of machine architecture can sometimes be a bit tricky. Ideally, we should have some way of finding the triplet just from the file itself. Committer notes: Address this on 32-bit systems by using PRIu64 from inttypes.h 3 17.58 almalinux:9-i386 : FAIL gcc version 11.4.1 20231218 (Red Hat 11.4.1-3) (GCC) util/llvm-c-helpers.cpp: In function ‘char* make_symbol_relative_string(dso*, const char*, u64, u64)’: util/llvm-c-helpers.cpp:150:52: error: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 5 has type ‘u64’ {aka +‘long long unsigned int’} [-Werror=format=] 150 | snprintf(buf, sizeof(buf), "%s+0x%lx", | ~~^ | | | long unsigned int | %llx 151 | demangled ? demangled : sym_name, addr - base_addr); | ~~~~~~~~~~~~~~~~ | | | u64 {aka long long unsigned int} cc1plus: all warnings being treated as errors Signed-off-by: Steinar H. 
Gunderson Cc: Ian Rogers Link: https://lore.kernel.org/r/20240803152008.2818485-3-sesse@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 193 +++++++++++++++++++++++++++++++++++++ tools/perf/util/llvm-c-helpers.cpp | 63 ++++++++++++ tools/perf/util/llvm-c-helpers.h | 11 +++ 3 files changed, 267 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 5e9be3487c6c..f05ba7739c1e 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -48,6 +48,7 @@ static int call__scnprintf(struct ins *ins, char *bf, size_t size, static void ins__sort(struct arch *arch); static int disasm_line__parse(char *line, const char **namep, char **rawp); static int disasm_line__parse_powerpc(struct disasm_line *dl); +static char *expand_tabs(char *line, char **storage, size_t *storage_len); static __attribute__((constructor)) void symbol__init_regexpr(void) { @@ -1354,7 +1355,9 @@ static int open_capstone_handle(struct annotate_args *args, bool is_64bit, return 0; } +#endif +#if defined(HAVE_LIBCAPSTONE_SUPPORT) || defined(HAVE_LIBLLVM_SUPPORT) struct find_file_offset_data { u64 ip; u64 offset; @@ -1418,7 +1421,9 @@ err: free(buf); return NULL; } +#endif +#ifdef HAVE_LIBCAPSTONE_SUPPORT static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, struct annotate_args *args, u64 addr) { @@ -1805,6 +1810,189 @@ err: count = -1; goto out; } + +#ifdef HAVE_LIBLLVM_SUPPORT +#include +#include +#include "util/llvm-c-helpers.h" + +struct symbol_lookup_storage { + u64 branch_addr; + u64 pcrel_load_addr; +}; + +/* + * Whenever LLVM wants to resolve an address into a symbol, it calls this + * callback. We don't ever actually _return_ anything (in particular, because + * it puts quotation marks around what we return), but we use this as a hint + * that there is a branch or PC-relative address in the expression that we + * should add some textual annotation for after the instruction. The caller + * will use this information to add the actual annotation. 
+ */ +static const char * +symbol_lookup_callback(void *disinfo, uint64_t value, + uint64_t *ref_type, + uint64_t address __maybe_unused, + const char **ref __maybe_unused) +{ + struct symbol_lookup_storage *storage = disinfo; + + if (*ref_type == LLVMDisassembler_ReferenceType_In_Branch) + storage->branch_addr = value; + else if (*ref_type == LLVMDisassembler_ReferenceType_In_PCrel_Load) + storage->pcrel_load_addr = value; + *ref_type = LLVMDisassembler_ReferenceType_InOut_None; + return NULL; +} + +static int symbol__disassemble_llvm(char *filename, struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + u64 start = map__rip_2objdump(map, sym->start); + u8 *buf; + u64 len; + u64 pc; + bool is_64bit; + char triplet[64]; + char disasm_buf[2048]; + size_t disasm_len; + struct disasm_line *dl; + LLVMDisasmContextRef disasm = NULL; + struct symbol_lookup_storage storage; + char *line_storage = NULL; + size_t line_storage_len = 0; + int ret = -1; + + if (args->options->objdump_path) + return -1; + + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargetMCs(); + LLVMInitializeAllDisassemblers(); + + buf = read_symbol(filename, map, sym, &len, &is_64bit); + if (buf == NULL) + return -1; + + if (arch__is(args->arch, "x86")) { + if (is_64bit) + scnprintf(triplet, sizeof(triplet), "x86_64-pc-linux"); + else + scnprintf(triplet, sizeof(triplet), "i686-pc-linux"); + } else { + scnprintf(triplet, sizeof(triplet), "%s-linux-gnu", + args->arch->name); + } + + disasm = LLVMCreateDisasm(triplet, &storage, 0, NULL, + symbol_lookup_callback); + if (disasm == NULL) + goto err; + + if (args->options->disassembler_style && + !strcmp(args->options->disassembler_style, "intel")) + LLVMSetDisasmOptions(disasm, + LLVMDisassembler_Option_AsmPrinterVariant); + + /* + * This needs to be set after AsmPrinterVariant, due to a bug in LLVM; + * setting AsmPrinterVariant makes a new instruction printer, making it + * forget about the PrintImmHex flag (which is applied before if both + * are given to the same call). 
+ */ + LLVMSetDisasmOptions(disasm, LLVMDisassembler_Option_PrintImmHex); + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + pc = start; + for (u64 offset = 0; offset < len; ) { + unsigned int ins_len; + + storage.branch_addr = 0; + storage.pcrel_load_addr = 0; + + ins_len = LLVMDisasmInstruction(disasm, buf + offset, + len - offset, pc, + disasm_buf, sizeof(disasm_buf)); + if (ins_len == 0) + goto err; + disasm_len = strlen(disasm_buf); + + if (storage.branch_addr != 0) { + char *name = llvm_name_for_code(dso, filename, + storage.branch_addr); + if (name != NULL) { + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - + disasm_len, + " <%s>", name); + free(name); + } + } + if (storage.pcrel_load_addr != 0) { + char *name = llvm_name_for_data(dso, filename, + storage.pcrel_load_addr); + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - disasm_len, + " # %#"PRIx64, + storage.pcrel_load_addr); + if (name) { + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - + disasm_len, + " <%s>", name); + free(name); + } + } + + args->offset = offset; + args->line = expand_tabs(disasm_buf, &line_storage, + &line_storage_len); + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + llvm_addr2line(filename, pc, &args->fileloc, + (unsigned int *)&args->line_nr, false, NULL); + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + free(args->fileloc); + pc += ins_len; + offset += ins_len; + } + + ret = 0; + +err: + LLVMDisasmDispose(disasm); + free(buf); + free(line_storage); + return ret; +} +#endif + /* * Possibly create a new version of line with tabs expanded. Returns the * existing or new line, storage is updated if a new line is allocated. 
If @@ -1951,6 +2139,11 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) } } +#ifdef HAVE_LIBLLVM_SUPPORT + err = symbol__disassemble_llvm(symfs_filename, sym, args); + if (err == 0) + goto out_remove_tmp; +#endif #ifdef HAVE_LIBCAPSTONE_SUPPORT err = symbol__disassemble_capstone(symfs_filename, sym, args); if (err == 0) diff --git a/tools/perf/util/llvm-c-helpers.cpp b/tools/perf/util/llvm-c-helpers.cpp index 3cc967ec6f28..663bcaba2041 100644 --- a/tools/perf/util/llvm-c-helpers.cpp +++ b/tools/perf/util/llvm-c-helpers.cpp @@ -8,8 +8,10 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" /* Needed for LLVM <= 15 */ #include +#include #pragma GCC diagnostic pop +#include #include #include #include @@ -19,6 +21,9 @@ extern "C" { #include "symbol_conf.h" #include "llvm-c-helpers.h" +extern "C" +char *dso__demangle_sym(struct dso *dso, int kmodule, const char *elf_name); + using namespace llvm; using llvm::symbolize::LLVMSymbolizer; @@ -132,3 +137,61 @@ int llvm_addr2line(const char *dso_name, u64 addr, return extract_file_and_line(*res_or_err, file, line); } } + +static char * +make_symbol_relative_string(struct dso *dso, const char *sym_name, + u64 addr, u64 base_addr) +{ + if (!strcmp(sym_name, "")) + return NULL; + + char *demangled = dso__demangle_sym(dso, 0, sym_name); + if (base_addr && base_addr != addr) { + char buf[256]; + snprintf(buf, sizeof(buf), "%s+0x%" PRIx64, + demangled ? demangled : sym_name, addr - base_addr); + free(demangled); + return strdup(buf); + } else { + if (demangled) + return demangled; + else + return strdup(sym_name); + } +} + +extern "C" +char *llvm_name_for_code(struct dso *dso, const char *dso_name, u64 addr) +{ + LLVMSymbolizer *symbolizer = get_symbolizer(); + object::SectionedAddress sectioned_addr = { + addr, + object::SectionedAddress::UndefSection + }; + Expected res_or_err = + symbolizer->symbolizeCode(dso_name, sectioned_addr); + if (!res_or_err) { + return NULL; + } + return make_symbol_relative_string( + dso, res_or_err->FunctionName.c_str(), + addr, res_or_err->StartAddress ? *res_or_err->StartAddress : 0); +} + +extern "C" +char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr) +{ + LLVMSymbolizer *symbolizer = get_symbolizer(); + object::SectionedAddress sectioned_addr = { + addr, + object::SectionedAddress::UndefSection + }; + Expected res_or_err = + symbolizer->symbolizeData(dso_name, sectioned_addr); + if (!res_or_err) { + return NULL; + } + return make_symbol_relative_string( + dso, res_or_err->Name.c_str(), + addr, res_or_err->Start); +} diff --git a/tools/perf/util/llvm-c-helpers.h b/tools/perf/util/llvm-c-helpers.h index 19332dd98e14..d2b99637a28a 100644 --- a/tools/perf/util/llvm-c-helpers.h +++ b/tools/perf/util/llvm-c-helpers.h @@ -13,6 +13,8 @@ extern "C" { #endif +struct dso; + struct llvm_a2l_frame { char* filename; char* funcname; @@ -42,6 +44,15 @@ int llvm_addr2line(const char* dso_name, bool unwind_inlines, struct llvm_a2l_frame** inline_frames); +/* + * Simple symbolizers for addresses; will convert something like + * 0x12345 to "func+0x123". Will return NULL if no symbol was found. + * + * The returned value must be freed by the caller, with free(). 
+ */
+char *llvm_name_for_code(struct dso *dso, const char *dso_name, u64 addr); +char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr); + +#ifdef __cplusplus +} +#endif + +#endif /* __PERF_LLVM_C_HELPERS */ -- cgit v1.2.3

From 6c99903e084c68887f7bc52a6819117f26820667 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 31 Aug 2024 00:04:10 -0700 Subject: perf pmus: Fix name comparisons on 32-bit systems

The hex PMU suffix may be 64-bit, but the comparisons were "unsigned long", i.e. 32-bit on 32-bit systems. This was causing the "PMU name comparison" test to fail in a 32-bit build.

Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: David Ahern Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240831070415.506194-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools')

diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 3fcabfd8fca1..769b920d9250 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -69,7 +69,7 @@ size_t pmu_name_len_no_suffix(const char *str) int pmu_name_cmp(const char *lhs_pmu_name, const char *rhs_pmu_name) { - unsigned long lhs_num = 0, rhs_num = 0; + unsigned long long lhs_num = 0, rhs_num = 0; size_t lhs_pmu_name_len = pmu_name_len_no_suffix(lhs_pmu_name); size_t rhs_pmu_name_len = pmu_name_len_no_suffix(rhs_pmu_name); int ret = strncmp(lhs_pmu_name, rhs_pmu_name, @@ -79,9 +79,9 @@ int pmu_name_cmp(const char *lhs_pmu_name, const char *rhs_pmu_name) return ret; if (lhs_pmu_name_len + 1 < strlen(lhs_pmu_name)) - lhs_num = strtoul(&lhs_pmu_name[lhs_pmu_name_len + 1], NULL, 16); + lhs_num = strtoull(&lhs_pmu_name[lhs_pmu_name_len + 1], NULL, 16); if (rhs_pmu_name_len + 1 < strlen(rhs_pmu_name)) - rhs_num = strtoul(&rhs_pmu_name[rhs_pmu_name_len + 1], NULL, 16); + rhs_num = strtoull(&rhs_pmu_name[rhs_pmu_name_len + 1], NULL, 16); return lhs_num < rhs_num ? -1 : (lhs_num > rhs_num ? 1 : 0); } -- cgit v1.2.3

From 38e2648a81204c9fc5b4c87a8ffce93a6ed91b65 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 31 Aug 2024 00:04:11 -0700 Subject: perf time-utils: Fix 32-bit nsec parsing

The "time utils" test fails in 32-bit builds: ... parse_nsec_time("18446744073.709551615") Failed. ptime 4294967295709551615 expected 18446744073709551615 ... Switch strtoul to strtoull, as an unsigned long in a 32-bit build isn't 64 bits.
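Editor's note: the failure quoted above is the classic ILP32 pitfall. strtoul() returns unsigned long, which is only 32 bits wide on a 32-bit build, so on overflow it clamps to ULONG_MAX (4294967295, exactly the bogus seconds value in the test output). A standalone illustration:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		const char *str = "18446744073";	/* needs more than 32 bits */

		/* 4294967295 (ULONG_MAX) on ILP32 targets */
		unsigned long narrow = strtoul(str, NULL, 10);
		/* 18446744073 everywhere */
		unsigned long long wide = strtoull(str, NULL, 10);

		printf("strtoul: %lu strtoull: %llu\n", narrow, wide);
		return 0;
	}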
Fixes: c284d669a20d408b ("perf tools: Move parse_nsec_time to time-utils.c") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: David Ahern Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240831070415.506194-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/time-utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c index 302443921681..1b91ccd4d523 100644 --- a/tools/perf/util/time-utils.c +++ b/tools/perf/util/time-utils.c @@ -20,7 +20,7 @@ int parse_nsec_time(const char *str, u64 *ptime) u64 time_sec, time_nsec; char *end; - time_sec = strtoul(str, &end, 10); + time_sec = strtoull(str, &end, 10); if (*end != '.' && *end != '\0') return -1; @@ -38,7 +38,7 @@ int parse_nsec_time(const char *str, u64 *ptime) for (i = strlen(nsec_buf); i < 9; i++) nsec_buf[i] = '0'; - time_nsec = strtoul(nsec_buf, &end, 10); + time_nsec = strtoull(nsec_buf, &end, 10); if (*end != '\0') return -1; } else -- cgit v1.2.3 From 91235380e5c78dc02f18d6afa97dff82ab3a6887 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 31 Aug 2024 00:04:12 -0700 Subject: perf test: Skip uprobe test if probe command isn't present The probe command is dependent on libelf. Skip the test if the required probe command isn't present. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: David Ahern Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240831070415.506194-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_uprobe_from_different_cu.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/test_uprobe_from_different_cu.sh b/tools/perf/tests/shell/test_uprobe_from_different_cu.sh index 82bc774a078a..33387c329f92 100755 --- a/tools/perf/tests/shell/test_uprobe_from_different_cu.sh +++ b/tools/perf/tests/shell/test_uprobe_from_different_cu.sh @@ -4,6 +4,13 @@ set -e +# Skip if there's no probe command. +if ! perf | grep probe then + echo "Skip: probe command isn't present" + exit 2 +fi + # skip if there's no gcc if ! [ -x "$(command -v gcc)" ]; then echo "failed: no gcc compiler" -- cgit v1.2.3 From 18f41f1ba5404cb4ca17590ba28258f8358f8695 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 31 Aug 2024 00:04:15 -0700 Subject: perf test: Make watchpoint data 32-bits on i386 i386 only supports watchpoints up to size 4; using 8 bytes causes extra counts and test failures.
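For context, this is roughly how such a watchpoint is configured through perf_event_attr; the helper below is a hedged sketch for illustration (the helper name is invented, while the attr fields and HW_BREAKPOINT_* constants come from the perf_event/hw_breakpoint UAPI headers).

```c
#include <string.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

/* Illustrative sketch, not from the patch: configure a write watchpoint.
 * i386 hardware supports breakpoint lengths of at most 4 bytes, so on
 * __i386__ the length should not exceed HW_BREAKPOINT_LEN_4; this is why
 * the test shrinks its data1 variable from u64 to u32 there. */
static void setup_watchpoint_attr(struct perf_event_attr *attr,
				  unsigned long addr, __u64 len)
{
	memset(attr, 0, sizeof(*attr));
	attr->type = PERF_TYPE_BREAKPOINT;
	attr->size = sizeof(*attr);
	attr->bp_type = HW_BREAKPOINT_W;
	attr->bp_addr = addr;
	attr->bp_len = len;	/* e.g. HW_BREAKPOINT_LEN_4 */
	attr->exclude_kernel = 1;
	attr->exclude_hv = 1;
}
```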
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: David Ahern Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240831070415.506194-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/wp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/wp.c b/tools/perf/tests/wp.c index cc8719609b19..6c178985e37f 100644 --- a/tools/perf/tests/wp.c +++ b/tools/perf/tests/wp.c @@ -20,7 +20,12 @@ do { \ TEST_ASSERT_VAL(text, count == val); \ } while (0) +#ifdef __i386__ +/* Only breakpoint length less-than 8 has hardware support on i386. */ +volatile u32 data1; +#else volatile u64 data1; +#endif volatile u8 data2[3]; #ifndef __s390x__ -- cgit v1.2.3 From 76d3685400944e63c2c6e791a72d5329c2843770 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:11 -0700 Subject: perf stat: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that they can be optimized by the BPF core. Committer testing: root@x1:~# perf stat --bpf-counters -e cpu_core/cycles/,cpu_core/instructions/ sleep 1 Performance counter stats for 'sleep 1': 2,442,583 cpu_core/cycles/ 2,494,425 cpu_core/instructions/ 1.002687372 seconds time elapsed 0.001126000 seconds user 0.001166000 seconds sys root@x1:~# perf trace -e bpf --max-events 10 perf stat --bpf-counters -e cpu_core/cycles/,cpu_core/instructions/ sleep 1 0.000 ( 0.019 ms): perf/2944119 bpf(cmd: OBJ_GET, uattr: 0x7fffdf5cdd40, size: 20) = 5 0.021 ( 0.002 ms): perf/2944119 bpf(cmd: OBJ_GET_INFO_BY_FD, uattr: 0x7fffdf5cdcd0, size: 16) = 0 0.030 ( 0.005 ms): perf/2944119 bpf(cmd: MAP_LOOKUP_ELEM, uattr: 0x7fffdf5ceda0, size: 32) = 0 0.037 ( 0.004 ms): perf/2944119 bpf(cmd: LINK_GET_FD_BY_ID, uattr: 0x7fffdf5ced80, size: 12) = -1 ENOENT (No such file or directory) 0.189 ( 0.004 ms): perf/2944119 bpf(cmd: 36, uattr: 0x7fffdf5cec10, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.201 ( 0.095 ms): perf/2944119 bpf(cmd: PROG_LOAD, uattr: 0x7fffdf5ce940, size: 148) = 10 0.305 ( 0.026 ms): perf/2944119 bpf(cmd: PROG_LOAD, uattr: 0x7fffdf5cea00, size: 148) = 10 0.347 ( 0.012 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce8e0, size: 40) = 10 0.364 ( 0.004 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce950, size: 40) = 10 0.376 ( 0.006 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce730, size: 40) = 10 root@x1:~# Performance counter stats for 'sleep 1': 271,221 cpu_core/cycles/ 139,150 cpu_core/instructions/ 1.002881677 seconds time elapsed 0.001318000 seconds user 0.001314000 seconds sys root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240902200515.2103769-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter_cgroup.c | 6 +++--- tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c index ea29c372f339..6ff42619de12 100644 --- a/tools/perf/util/bpf_counter_cgroup.c +++
b/tools/perf/util/bpf_counter_cgroup.c @@ -61,6 +61,9 @@ static int bperf_load_program(struct evlist *evlist) skel->rodata->num_cpus = total_cpus; skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups; + if (cgroup_is_v2("perf_event") > 0) + skel->rodata->use_cgroup_v2 = 1; + BUG_ON(evlist->core.nr_entries % nr_cgroups != 0); /* we need one copy of events per cpu for reading */ @@ -82,9 +85,6 @@ static int bperf_load_program(struct evlist *evlist) goto out; } - if (cgroup_is_v2("perf_event") > 0) - skel->bss->use_cgroup_v2 = 1; - err = -1; cgrp_switch = evsel__new(&cgrp_switch_attr); diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c index 6a438e0102c5..57cab7647a9a 100644 --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c @@ -57,9 +57,9 @@ struct cgroup___old { const volatile __u32 num_events = 1; const volatile __u32 num_cpus = 1; +const volatile int use_cgroup_v2 = 0; int enabled = 0; -int use_cgroup_v2 = 0; int perf_subsys_id = -1; static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) -- cgit v1.2.3 From ac5a23b2f2868c54fdd2fc84c1990ebeb1e06188 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:12 -0700 Subject: perf ftrace latency: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that they can be optimized by the BPF core.
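The pattern this series applies looks roughly like the sketch below (names are illustrative, not the actual skeleton; includes omitted). A 'const volatile' global lands in the BPF object's .rodata map, which libbpf freezes at load time, so userspace must set it between open and load; the verifier then treats it as a known constant and can dead-code-eliminate branches guarded by it.

```c
/* BPF side (illustrative knob.bpf.c) */
const volatile int has_cpu = 0;	/* knob, fixed at load time */

SEC("kprobe/func")
int func_begin(void *ctx)
{
	if (has_cpu) {
		/* cpu filtering work; if has_cpu stays 0 the verifier
		 * can drop this whole branch as dead code */
	}
	return 0;
}

/* userspace side, using a generated skeleton knob.skel.h */
struct knob_bpf *skel = knob_bpf__open();

skel->rodata->has_cpu = 1;	/* writable only between open and load */
if (knob_bpf__load(skel))	/* .rodata is frozen here */
	goto err;
```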
s | 8 | | root@x1:~# root@x1:~# perf trace -e bpf perf ftrace latency --use-bpf -T schedule 0.000 ( 0.015 ms): perf/2944525 bpf(cmd: 36, uattr: 0x7ffe80de7b40, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.025 ( 0.102 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7870, size: 148) = 8 0.136 ( 0.026 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7930, size: 148) = 8 0.174 ( 0.026 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de77e0, size: 148) = 8 0.205 ( 0.010 ms): perf/2944525 bpf(uattr: 0x7ffe80de7990, size: 80) = 8 0.227 ( 0.011 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7810, size: 40) = 8 0.244 ( 0.004 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7880, size: 40) = 8 0.257 ( 0.006 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7660, size: 40) = 8 0.265 ( 0.058 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7730, size: 148) = 9 0.330 ( 0.004 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de78e0, size: 40) = 8 0.337 ( 0.003 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7890, size: 40) = 8 0.343 ( 0.004 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7880, size: 40) = 8 0.349 ( 0.003 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de78b0, size: 40) = 8 0.355 ( 0.004 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7890, size: 40) = 8 0.361 ( 0.003 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de78b0, size: 40) = 8 0.367 ( 0.003 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7880, size: 40) = 8 0.373 ( 0.014 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7a00, size: 40) = 8 0.390 ( 0.358 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.763 ( 0.014 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.783 ( 0.011 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.798 ( 0.017 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.819 ( 0.003 ms): perf/2944525 bpf(uattr: 0x7ffe80de7700, size: 80) = 9 0.824 ( 0.047 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de76c0, size: 148) = 10 0.878 ( 0.008 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.891 ( 0.014 ms): perf/2944525 bpf(cmd: MAP_UPDATE_ELEM, uattr: 0x7ffe80de79e0, size: 32) = 0 0.910 ( 0.103 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7880, size: 148) = 9 1.016 ( 0.143 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7880, size: 148) = 10 3.777 ( 0.068 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7570, size: 148) = 12 3.848 ( 0.003 ms): perf/2944525 bpf(cmd: LINK_CREATE, uattr: 0x7ffe80de7550, size: 64) = -1 EBADF (Bad file descriptor) 3.859 ( 0.006 ms): perf/2944525 bpf(cmd: LINK_CREATE, uattr: 0x7ffe80de77c0, size: 64) = 12 6.504 ( 0.010 ms): perf/2944525 bpf(cmd: LINK_CREATE, uattr: 0x7ffe80de77c0, size: 64) = 14 ^C# DURATION | COUNT | GRAPH | 0 - 1 us | 0 | | 1 - 2 us | 0 | | 2 - 4 us | 1 | | 4 - 8 us | 3 | | 8 - 16 us | 3 | | 16 - 32 us | 11 | | 32 - 64 us | 9 | | 64 - 128 us | 17 | | 128 - 256 us | 30 | # | 256 - 512 us | 20 | | 512 - 1024 us | 42 | # | 1 - 2 ms | 151 | ###### | 2 - 4 ms | 106 | #### | 4 - 8 ms | 18 | | 8 - 16 ms | 149 | ###### | 16 - 32 ms | 30 | # | 32 - 64 ms | 17 | | 64 - 128 ms | 360 | ############### | 128 - 256 ms | 52 | ## | 256 - 512 ms | 18 | | 512 - 1024 ms | 28 | # | 1 - ... 
s | 5 | | root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240902200515.2103769-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_ftrace.c | 8 ++++---- tools/perf/util/bpf_skel/func_latency.bpf.c | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_ftrace.c b/tools/perf/util/bpf_ftrace.c index 7a4297d8fd2c..06d1c4018407 100644 --- a/tools/perf/util/bpf_ftrace.c +++ b/tools/perf/util/bpf_ftrace.c @@ -40,13 +40,17 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace) if (ftrace->target.cpu_list) { ncpus = perf_cpu_map__nr(ftrace->evlist->core.user_requested_cpus); bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus); + skel->rodata->has_cpu = 1; } if (target__has_task(&ftrace->target) || target__none(&ftrace->target)) { ntasks = perf_thread_map__nr(ftrace->evlist->core.threads); bpf_map__set_max_entries(skel->maps.task_filter, ntasks); + skel->rodata->has_task = 1; } + skel->rodata->use_nsec = ftrace->use_nsec; + set_max_rlimit(); err = func_latency_bpf__load(skel); @@ -59,7 +63,6 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace) u32 cpu; u8 val = 1; - skel->bss->has_cpu = 1; fd = bpf_map__fd(skel->maps.cpu_filter); for (i = 0; i < ncpus; i++) { @@ -72,7 +75,6 @@ u32 pid; u8 val = 1; - skel->bss->has_task = 1; fd = bpf_map__fd(skel->maps.task_filter); for (i = 0; i < ntasks; i++) { @@ -81,8 +83,6 @@ } } - skel->bss->use_nsec = ftrace->use_nsec; - skel->links.func_begin = bpf_program__attach_kprobe(skel->progs.func_begin, false, func->name); if (IS_ERR(skel->links.func_begin)) { diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c index 9d01e3af7479..f613dc9cb123 100644 --- a/tools/perf/util/bpf_skel/func_latency.bpf.c +++ b/tools/perf/util/bpf_skel/func_latency.bpf.c @@ -37,9 +37,10 @@ struct { int enabled = 0; -int has_cpu = 0; -int has_task = 0; -int use_nsec = 0; + +const volatile int has_cpu = 0; +const volatile int has_task = 0; +const volatile int use_nsec = 0; SEC("kprobe/func") int BPF_PROG(func_begin) -- cgit v1.2.3 From 066fd840873f2187deb4a646c5f531a8dba2fd36 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:13 -0700 Subject: perf kwork: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that they can be optimized by the BPF core.
Committer testing: root@x1:~# perf kwork report --use-bpf Starting trace, Hit to stop and report ^C Kwork Name | Cpu | Total Runtime | Count | Max runtime | Max runtime start | Max runtime end | -------------------------------------------------------------------------------------------------------------------------------- (w)intel_atomic_commit_work [ | 0009 | 18.680 ms | 2 | 18.553 ms | 362410.681580 s | 362410.700133 s | (w)pm_runtime_work | 0007 | 13.300 ms | 1 | 13.300 ms | 362410.254996 s | 362410.268295 s | (w)intel_atomic_commit_work [ | 0009 | 9.846 ms | 2 | 9.717 ms | 362410.172352 s | 362410.182069 s | (w)acpi_ec_event_processor | 0002 | 8.106 ms | 1 | 8.106 ms | 362410.463187 s | 362410.471293 s | (s)SCHED:7 | 0000 | 1.351 ms | 106 | 0.063 ms | 362410.658017 s | 362410.658080 s | i915:157 | 0008 | 0.994 ms | 13 | 0.361 ms | 362411.222125 s | 362411.222486 s | (s)SCHED:7 | 0001 | 0.703 ms | 98 | 0.047 ms | 362410.245004 s | 362410.245051 s | (s)SCHED:7 | 0005 | 0.674 ms | 42 | 0.074 ms | 362411.483039 s | 362411.483113 s | (s)NET_RX:3 | 0001 | 0.556 ms | 10 | 0.079 ms | 362411.066388 s | 362411.066467 s | root@x1:~# perf trace -e bpf --max-events 5 perf kwork report --use-bpf 0.000 ( 0.016 ms): perf/2948007 bpf(cmd: 36, uattr: 0x7ffededa6660, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.026 ( 0.106 ms): perf/2948007 bpf(cmd: PROG_LOAD, uattr: 0x7ffededa6390, size: 148) = 12 0.152 ( 0.032 ms): perf/2948007 bpf(cmd: PROG_LOAD, uattr: 0x7ffededa6450, size: 148) = 12 26.247 ( 0.138 ms): perf/2948007 bpf(cmd: PROG_LOAD, uattr: 0x7ffededa6300, size: 148) = 12 26.396 ( 0.012 ms): perf/2948007 bpf(uattr: 0x7ffededa64b0, size: 80) = 12 Starting trace, Hit to stop and report root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20240902200515.2103769-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_kwork.c | 9 +++++---- tools/perf/util/bpf_kwork_top.c | 7 ++++--- tools/perf/util/bpf_skel/kwork_top.bpf.c | 2 +- tools/perf/util/bpf_skel/kwork_trace.bpf.c | 5 +++-- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_kwork.c b/tools/perf/util/bpf_kwork.c index 44f0f708a15d..6c7126b7670d 100644 --- a/tools/perf/util/bpf_kwork.c +++ b/tools/perf/util/bpf_kwork.c @@ -176,8 +176,6 @@ static int setup_filters(struct perf_kwork *kwork) bpf_map_update_elem(fd, &cpu.cpu, &val, BPF_ANY); } perf_cpu_map__put(map); - - skel->bss->has_cpu_filter = 1; } if (kwork->profile_name != NULL) { @@ -197,8 +195,6 @@ static int setup_filters(struct perf_kwork *kwork) key = 0; bpf_map_update_elem(fd, &key, kwork->profile_name, BPF_ANY); - - skel->bss->has_name_filter = 1; } return 0; @@ -239,6 +235,11 @@ int perf_kwork__trace_prepare_bpf(struct perf_kwork *kwork) class_bpf->load_prepare(kwork); } + if (kwork->cpu_list != NULL) + skel->rodata->has_cpu_filter = 1; + if (kwork->profile_name != NULL) + skel->rodata->has_name_filter = 1; + if (kwork_trace_bpf__load(skel)) { pr_debug("Failed to load kwork trace skeleton\n"); goto out; diff --git a/tools/perf/util/bpf_kwork_top.c b/tools/perf/util/bpf_kwork_top.c index 22a3b00a1e23..7261cad43468 100644 --- a/tools/perf/util/bpf_kwork_top.c +++ b/tools/perf/util/bpf_kwork_top.c @@ -151,14 +151,12 @@ static int setup_filters(struct perf_kwork *kwork) bpf_map_update_elem(fd, &cpu.cpu, &val, BPF_ANY); } 
perf_cpu_map__put(map); - - skel->bss->has_cpu_filter = 1; } return 0; } -int perf_kwork__top_prepare_bpf(struct perf_kwork *kwork __maybe_unused) +int perf_kwork__top_prepare_bpf(struct perf_kwork *kwork) { struct bpf_program *prog; struct kwork_class *class; @@ -193,6 +191,9 @@ int perf_kwork__top_prepare_bpf(struct perf_kwork *kwork __maybe_unused) class_bpf->load_prepare(); } + if (kwork->cpu_list) + skel->rodata->has_cpu_filter = 1; + if (kwork_top_bpf__load(skel)) { pr_debug("Failed to load kwork top skeleton\n"); goto out; diff --git a/tools/perf/util/bpf_skel/kwork_top.bpf.c b/tools/perf/util/bpf_skel/kwork_top.bpf.c index 84c15ccbab44..594da91965a2 100644 --- a/tools/perf/util/bpf_skel/kwork_top.bpf.c +++ b/tools/perf/util/bpf_skel/kwork_top.bpf.c @@ -84,7 +84,7 @@ struct { int enabled = 0; -int has_cpu_filter = 0; +const volatile int has_cpu_filter = 0; __u64 from_timestamp = 0; __u64 to_timestamp = 0; diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c index 063c124e0999..cbd79bc4b330 100644 --- a/tools/perf/util/bpf_skel/kwork_trace.bpf.c +++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c @@ -68,8 +68,9 @@ } perf_kwork_name_filter SEC(".maps"); int enabled = 0; -int has_cpu_filter = 0; -int has_name_filter = 0; + +const volatile int has_cpu_filter = 0; +const volatile int has_name_filter = 0; static __always_inline int local_strncmp(const char *s1, unsigned int sz, const char *s2) -- cgit v1.2.3 From 4afdc00c378f34943fb3a3cc08db4babdacb5c5b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:14 -0700 Subject: perf lock contention: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that they can be optimized by the BPF core.
Committer testing: root@x1:~# perf lock contention --use-bpf contended total wait max wait avg wait type caller 5 31.57 us 14.93 us 6.31 us mutex btrfs_delayed_update_inode+0x43 1 16.91 us 16.91 us 16.91 us rwsem:R btrfs_tree_read_lock_nested+0x1b 1 15.13 us 15.13 us 15.13 us spinlock btrfs_getattr+0xd1 1 6.65 us 6.65 us 6.65 us rwsem:R btrfs_tree_read_lock_nested+0x1b 1 4.34 us 4.34 us 4.34 us spinlock process_one_work+0x1a9 root@x1:~# root@x1:~# perf trace -e bpf --max-events 10 perf lock contention --use-bpf 0.000 ( 0.013 ms): :2948281/2948281 bpf(cmd: 36, uattr: 0x7ffd5f12d730, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.024 ( 0.120 ms): :2948281/2948281 bpf(cmd: PROG_LOAD, uattr: 0x7ffd5f12d460, size: 148) = 16 0.158 ( 0.034 ms): :2948281/2948281 bpf(cmd: PROG_LOAD, uattr: 0x7ffd5f12d520, size: 148) = 16 26.653 ( 0.154 ms): perf/2948281 bpf(cmd: PROG_LOAD, uattr: 0x7ffd5f12d3d0, size: 148) = 16 26.825 ( 0.014 ms): perf/2948281 bpf(uattr: 0x7ffd5f12d580, size: 80) = 16 87.924 ( 0.038 ms): perf/2948281 bpf(cmd: BTF_LOAD, uattr: 0x7ffd5f12d400, size: 40) = 16 87.988 ( 0.006 ms): perf/2948281 bpf(cmd: BTF_LOAD, uattr: 0x7ffd5f12d470, size: 40) = 16 88.019 ( 0.006 ms): perf/2948281 bpf(cmd: BTF_LOAD, uattr: 0x7ffd5f12d250, size: 40) = 16 88.029 ( 0.172 ms): perf/2948281 bpf(cmd: PROG_LOAD, uattr: 0x7ffd5f12d320, size: 148) = 17 88.217 ( 0.005 ms): perf/2948281 bpf(cmd: BTF_LOAD, uattr: 0x7ffd5f12d4d0, size: 40) = 16 root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240902200515.2103769-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_lock_contention.c | 45 ++++++++++++++------------ tools/perf/util/bpf_skel/lock_contention.bpf.c | 27 ++++++++-------- 2 files changed, 38 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index bc4e92c0c08b..41a1ad087895 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -46,14 +46,22 @@ int lock_contention_prepare(struct lock_contention *con) else bpf_map__set_max_entries(skel->maps.stacks, 1); - if (target__has_cpu(target)) + if (target__has_cpu(target)) { + skel->rodata->has_cpu = 1; ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus); - if (target__has_task(target)) + } + if (target__has_task(target)) { + skel->rodata->has_task = 1; ntasks = perf_thread_map__nr(evlist->core.threads); - if (con->filters->nr_types) + } + if (con->filters->nr_types) { + skel->rodata->has_type = 1; ntypes = con->filters->nr_types; - if (con->filters->nr_cgrps) + } + if (con->filters->nr_cgrps) { + skel->rodata->has_cgroup = 1; ncgrps = con->filters->nr_cgrps; + } /* resolve lock name filters to addr */ if (con->filters->nr_syms) { @@ -82,6 +90,7 @@ int lock_contention_prepare(struct lock_contention *con) con->filters->addrs = addrs; } naddrs = con->filters->nr_addrs; + skel->rodata->has_addr = 1; } bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus); @@ -90,6 +99,16 @@ int lock_contention_prepare(struct lock_contention *con) bpf_map__set_max_entries(skel->maps.addr_filter, naddrs); bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps); + skel->rodata->stack_skip = con->stack_skip; + skel->rodata->aggr_mode = con->aggr_mode; + skel->rodata->needs_callstack = con->save_callstack; + skel->rodata->lock_owner = 
con->owner; + + if (con->aggr_mode == LOCK_AGGR_CGROUP || con->filters->nr_cgrps) { + if (cgroup_is_v2("perf_event")) + skel->rodata->use_cgroup_v2 = 1; + } + if (lock_contention_bpf__load(skel) < 0) { pr_err("Failed to load lock-contention BPF skeleton\n"); return -1; @@ -99,7 +118,6 @@ u32 cpu; u8 val = 1; - skel->bss->has_cpu = 1; fd = bpf_map__fd(skel->maps.cpu_filter); for (i = 0; i < ncpus; i++) { @@ -112,7 +130,6 @@ u32 pid; u8 val = 1; - skel->bss->has_task = 1; fd = bpf_map__fd(skel->maps.task_filter); for (i = 0; i < ntasks; i++) { @@ -125,7 +142,6 @@ u32 pid = evlist->workload.pid; u8 val = 1; - skel->bss->has_task = 1; fd = bpf_map__fd(skel->maps.task_filter); bpf_map_update_elem(fd, &pid, &val, BPF_ANY); } @@ -133,7 +149,6 @@ if (con->filters->nr_types) { u8 val = 1; - skel->bss->has_type = 1; fd = bpf_map__fd(skel->maps.type_filter); for (i = 0; i < con->filters->nr_types; i++) @@ -143,7 +158,6 @@ if (con->filters->nr_addrs) { u8 val = 1; - skel->bss->has_addr = 1; fd = bpf_map__fd(skel->maps.addr_filter); for (i = 0; i < con->filters->nr_addrs; i++) @@ -153,25 +167,14 @@ if (con->filters->nr_cgrps) { u8 val = 1; - skel->bss->has_cgroup = 1; fd = bpf_map__fd(skel->maps.cgroup_filter); for (i = 0; i < con->filters->nr_cgrps; i++) bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY); } - /* these don't work well if in the rodata section */ - skel->bss->stack_skip = con->stack_skip; - skel->bss->aggr_mode = con->aggr_mode; - skel->bss->needs_callstack = con->save_callstack; - skel->bss->lock_owner = con->owner; - - if (con->aggr_mode == LOCK_AGGR_CGROUP) { - if (cgroup_is_v2("perf_event")) - skel->bss->use_cgroup_v2 = 1; - + if (con->aggr_mode == LOCK_AGGR_CGROUP) read_all_cgroups(&con->cgroups); - } bpf_program__set_autoload(skel->progs.collect_lock_syms, false); diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index 52a876b42699..1069bda5d733 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -117,21 +117,22 @@ struct mm_struct___new { } __attribute__((preserve_access_index)); /* control flags */ -int enabled; -int has_cpu; -int has_task; -int has_type; -int has_addr; -int has_cgroup; -int needs_callstack; -int stack_skip; -int lock_owner; - -int use_cgroup_v2; -int perf_subsys_id = -1; +const volatile int has_cpu; +const volatile int has_task; +const volatile int has_type; +const volatile int has_addr; +const volatile int has_cgroup; +const volatile int needs_callstack; +const volatile int stack_skip; +const volatile int lock_owner; +const volatile int use_cgroup_v2; /* determine the key of lock stat */ -int aggr_mode; +const volatile int aggr_mode; + +int enabled; + +int perf_subsys_id = -1; __u64 end_ts; -- cgit v1.2.3 From 8b3b1bb3ea1f930d44d79dfb0b8cb7d62db08ed6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:15 -0700 Subject: perf record offcpu: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that they can be optimized by the BPF core.
Committer testing: root@x1:~# perf record --off-cpu ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.807 MB perf.data (5645 samples) ] root@x1:~# perf evlist cpu_atom/cycles/P cpu_core/cycles/P offcpu-time dummy:u root@x1:~# perf evlist -v cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0xa00000000, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1 cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x400000000, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1 offcpu-time: type: 1 (software), size: 136, config: 0xa (PERF_COUNT_SW_BPF_OUTPUT), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, sample_id_all: 1 dummy:u: type: 1 (software), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|IDENTIFIER, read_format: ID|LOST, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 root@x1:~# perf trace -e bpf --max-events 5 perf record --off-cpu 0.000 ( 0.015 ms): :2949124/2949124 bpf(cmd: 36, uattr: 0x7ffefc6dbe30, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.031 ( 0.115 ms): :2949124/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbb60, size: 148) = 14 0.159 ( 0.037 ms): :2949124/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbc20, size: 148) = 14 23.868 ( 0.144 ms): perf/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbad0, size: 148) = 14 24.027 ( 0.014 ms): perf/2949124 bpf(uattr: 0x7ffefc6dbc80, size: 80) = 14 root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240902200515.2103769-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_off_cpu.c | 16 ++++++++-------- tools/perf/util/bpf_skel/off_cpu.bpf.c | 9 +++++---- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c index 6af36142dc5a..a590a8ac1f9d 100644 --- a/tools/perf/util/bpf_off_cpu.c +++ b/tools/perf/util/bpf_off_cpu.c @@ -73,14 +73,12 @@ static void off_cpu_start(void *arg) struct evlist *evlist = arg; /* update task filter for the given workload */ - if (!skel->bss->has_cpu && !skel->bss->has_task && + if (skel->rodata->has_task && skel->rodata->uses_tgid && perf_thread_map__pid(evlist->core.threads, 0) != -1) { int fd; u32 pid; u8 val = 1; - skel->bss->has_task = 1; - skel->bss->uses_tgid = 1; fd = bpf_map__fd(skel->maps.task_filter); pid = perf_thread_map__pid(evlist->core.threads, 0); bpf_map_update_elem(fd, &pid, &val, BPF_ANY); @@ -148,6 +146,7 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, if (target->cpu_list) { ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus); bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus); + skel->rodata->has_cpu = 1; } if (target->pid) { @@ -173,11 +172,16 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, ntasks = MAX_PROC; bpf_map__set_max_entries(skel->maps.task_filter, ntasks); + 
skel->rodata->has_task = 1; + skel->rodata->uses_tgid = 1; } else if (target__has_task(target)) { ntasks = perf_thread_map__nr(evlist->core.threads); bpf_map__set_max_entries(skel->maps.task_filter, ntasks); + skel->rodata->has_task = 1; } else if (target__none(target)) { bpf_map__set_max_entries(skel->maps.task_filter, MAX_PROC); + skel->rodata->has_task = 1; + skel->rodata->uses_tgid = 1; } if (evlist__first(evlist)->cgrp) { @@ -186,6 +190,7 @@ if (!cgroup_is_v2("perf_event")) skel->rodata->uses_cgroup_v1 = true; + skel->rodata->has_cgroup = 1; } if (opts->record_cgroup) { @@ -208,7 +213,6 @@ u32 cpu; u8 val = 1; - skel->bss->has_cpu = 1; fd = bpf_map__fd(skel->maps.cpu_filter); for (i = 0; i < ncpus; i++) { @@ -220,8 +224,6 @@ if (target->pid) { u8 val = 1; - skel->bss->has_task = 1; - skel->bss->uses_tgid = 1; fd = bpf_map__fd(skel->maps.task_filter); strlist__for_each_entry(pos, pid_slist) { @@ -240,7 +242,6 @@ u32 pid; u8 val = 1; - skel->bss->has_task = 1; fd = bpf_map__fd(skel->maps.task_filter); for (i = 0; i < ntasks; i++) { @@ -253,7 +254,6 @@ struct evsel *evsel; u8 val = 1; - skel->bss->has_cgroup = 1; fd = bpf_map__fd(skel->maps.cgroup_filter); evlist__for_each_entry(evlist, evsel) { diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c index d877a0a9731f..c152116df72f 100644 --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c @@ -85,10 +85,11 @@ struct task_struct___old { } __attribute__((preserve_access_index)); int enabled = 0; -int has_cpu = 0; -int has_task = 0; -int has_cgroup = 0; -int uses_tgid = 0; + +const volatile int has_cpu = 0; +const volatile int has_task = 0; +const volatile int has_cgroup = 0; +const volatile int uses_tgid = 0; const volatile bool has_prev_state = false; const volatile bool needs_cgroup = false; -- cgit v1.2.3 From bf0db8c759ba137cebfeda9eecc1f728cb14dab7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 29 Feb 2024 08:18:28 -0800 Subject: perf script: Minimize "not reaching sample" for '-F +brstackinsn' In some situations 'perf script -F +brstackinsn' sees a lot of "not reaching sample" messages. This happens when the last LBR block before the sample contains a branch that is not in the LBR, and the instruction dumping stops. $ perf record -b emacs -Q --batch '()' [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.396 MB perf.data (443 samples) ] $ perf script -F +brstackinsn ... 00007f0ab2d171a4 insn: 41 0f 94 c0 00007f0ab2d171a8 insn: 83 fa 01 00007f0ab2d171ab insn: 74 d3 # PRED 6 cycles [313] 1.00 IPC 00007f0ab2d17180 insn: 45 84 c0 00007f0ab2d17183 insn: 74 28 ... not reaching sample ... $ perf script -F +brstackinsn | grep -c reach 136 $ This is a problem for further analysis that wants to see the full code up to the sample. There are two common cases where the message is bogus: - The LBR only logs taken branches, but the branch might be a conditional branch that is not taken (that is the most common case actually) - The LBR sampling uses a filter ignoring some branches, but the perf script check considers all branches.
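As a hedged sketch of that second point (the helper name is invented for illustration; the check itself mirrors the builtin-script.c change below): a missing branch should only be treated as an error when the event's branch filter was actually recording every branch.

```c
#include <stdbool.h>
#include <linux/perf_event.h>

/* Sketch, not the patch itself: branch_sample_type == 0 covers events
 * recorded without an explicit filter; PERF_SAMPLE_BRANCH_ANY means the
 * LBR captured every branch type, so a gap really is a missed branch. */
static bool lbr_records_all_branches(const struct perf_event_attr *attr)
{
	return attr->branch_sample_type == 0 ||
	       (attr->branch_sample_type & PERF_SAMPLE_BRANCH_ANY);
}
```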
This patch fixes these two conditions by only stopping at unconditional branches (a conditional branch may simply not have been taken), and by checking the perf_event_attr's branch filter attributes. For the test case above it fixes all the messages: $ ./perf script -F +brstackinsn | grep -c reach 0 Note that there are still conditions when the message is hit -- sometimes there can be an unconditional branch that misses the LBR update before the sample -- but they are much rarer now. Signed-off-by: Andi Kleen Reviewed-by: Adrian Hunter Link: https://lore.kernel.org/r/20240229161828.386397-1-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 6 ++++-- tools/perf/util/dump-insn.c | 2 +- tools/perf/util/dump-insn.h | 2 +- tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c | 5 +++-- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 206b08426555..dbe792b52c5c 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1428,7 +1428,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, * Due to pipeline delays the LBRs might be missing a branch * or two, which can result in very large or negative blocks * between final branch and sample. When this happens just - * continue walking after the last TO until we hit a branch. + * continue walking after the last TO. */ start = entries[0].to; end = sample->ip; @@ -1463,7 +1463,9 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += fprintf(fp, "\n"); if (ilen == 0) break; - if (arch_is_branch(buffer + off, len - off, x.is64bit) && start + off != sample->ip) { + if ((attr->branch_sample_type == 0 || attr->branch_sample_type & PERF_SAMPLE_BRANCH_ANY) + && arch_is_uncond_branch(buffer + off, len - off, x.is64bit) + && start + off != sample->ip) { /* * Hit a missing branch. Just stop.
*/ diff --git a/tools/perf/util/dump-insn.c index 2bd8585db93c..c1cc0ade48d0 100644 --- a/tools/perf/util/dump-insn.c +++ b/tools/perf/util/dump-insn.c @@ -15,7 +15,7 @@ const char *dump_insn(struct perf_insn *x __maybe_unused, } __weak -int arch_is_branch(const unsigned char *buf __maybe_unused, +int arch_is_uncond_branch(const unsigned char *buf __maybe_unused, size_t len __maybe_unused, int x86_64 __maybe_unused) { diff --git a/tools/perf/util/dump-insn.h index 4a7797dd6d09..20d4d7bb5275 100644 --- a/tools/perf/util/dump-insn.h +++ b/tools/perf/util/dump-insn.h @@ -21,6 +21,6 @@ struct perf_insn { const char *dump_insn(struct perf_insn *x, u64 ip, u8 *inbuf, int inlen, int *lenp); -int arch_is_branch(const unsigned char *buf, size_t len, int x86_64); +int arch_is_uncond_branch(const unsigned char *buf, size_t len, int x86_64); #endif diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 4407130d91f8..47cf35799a4d 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -209,12 +209,13 @@ int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64, return 0; } -int arch_is_branch(const unsigned char *buf, size_t len, int x86_64) +int arch_is_uncond_branch(const unsigned char *buf, size_t len, int x86_64) { struct intel_pt_insn in; if (intel_pt_get_insn(buf, len, x86_64, &in) < 0) return -1; - return in.branch != INTEL_PT_BR_NO_BRANCH; + return in.branch == INTEL_PT_BR_UNCONDITIONAL || + in.branch == INTEL_PT_BR_INDIRECT; } const char *dump_insn(struct perf_insn *x, uint64_t ip __maybe_unused, -- cgit v1.2.3 From b4bcdff7e839237d89a2aa15b6042b96b0240ac4 Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Sat, 10 Aug 2024 19:23:33 +0530 Subject: selftests: filesystems: fix warn_unused_result build warnings Add return value checks for the read & write calls in the test_listmount_ns function. This patch resolves the compilation warnings below: ``` statmount_test_ns.c: In function ‘test_listmount_ns’: statmount_test_ns.c:322:17: warning: ignoring return value of ‘write’ declared with attribute ‘warn_unused_result’ [-Wunused-result] statmount_test_ns.c:323:17: warning: ignoring return value of ‘read’ declared with attribute ‘warn_unused_result’ [-Wunused-result] ``` Signed-off-by: Abhinav Jain Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/statmount/statmount_test_ns.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index e044f5fc57fd..70cb0c8b21cf 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -319,8 +319,11 @@ static void test_listmount_ns(void) * Tell our parent how many mounts we have, and then wait for it * to tell us we're done.
*/ - write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)); - read(parent_ready_pipe[0], &cval, sizeof(cval)); + if (write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)) != + sizeof(nr_mounts)) + ret = NSID_ERROR; + if (read(parent_ready_pipe[0], &cval, sizeof(cval)) != sizeof(cval)) + ret = NSID_ERROR; exit(NSID_PASS); } -- cgit v1.2.3 From 3c6b818b097dd6932859bcc3d6722a74ec5931c1 Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 28 Aug 2024 02:31:29 -0700 Subject: tools/iio: Add memory allocation failure check for trigger_name Added a check to handle memory allocation failure for `trigger_name` and return `-ENOMEM`. Signed-off-by: Zhu Jun Link: https://patch.msgid.link/20240828093129.3040-1-zhujun2@cmss.chinamobile.com Signed-off-by: Jonathan Cameron --- tools/iio/iio_generic_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/iio/iio_generic_buffer.c b/tools/iio/iio_generic_buffer.c index 0d0a7a19d6f9..9ef5ee087eda 100644 --- a/tools/iio/iio_generic_buffer.c +++ b/tools/iio/iio_generic_buffer.c @@ -498,6 +498,10 @@ int main(int argc, char **argv) return -ENOMEM; } trigger_name = malloc(IIO_MAX_NAME_LENGTH); + if (!trigger_name) { + ret = -ENOMEM; + goto error; + } ret = read_sysfs_string("name", trig_dev_name, trigger_name); free(trig_dev_name); if (ret < 0) { -- cgit v1.2.3 From 575eec218059f90bf07a4e26b531fd6d54fe1769 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 12 Aug 2024 21:26:05 +0800 Subject: perf sched timehist: Skip print non-idle task samples when only show idle events When only idle events are shown, the runtime stats of non-idle tasks are not updated and stay 0, so there is no need to print non-idle samples. Before: # perf sched timehist -I Samples of sched_switch event do not have callchains.
time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 2090450.763235 [0000] migration/0[15] 0.000 0.000 0.000 2090450.763268 [0001] migration/1[21] 0.000 0.000 0.000 2090450.763309 [0002] migration/2[27] 0.000 0.000 0.000 2090450.763343 [0003] migration/3[33] 0.000 0.000 0.000 2090450.763469 [0004] migration/4[39] 0.000 0.000 0.000 2090450.763501 [0005] migration/5[45] 0.000 0.000 0.000 2090450.763622 [0006] migration/6[51] 0.000 0.000 0.000 2090450.763660 [0007] migration/7[57] 0.000 0.000 0.000 2090450.763741 [0009] migration/9[69] 0.000 0.000 0.000 2090450.763862 [0010] migration/10[75] 0.000 0.000 0.000 2090450.763894 [0011] migration/11[81] 0.000 0.000 0.000 2090450.764021 [0012] migration/12[87] 0.000 0.000 0.000 2090450.764056 [0013] migration/13[93] 0.000 0.000 0.000 2090450.764135 [0014] migration/14[99] 0.000 0.000 0.000 2090450.764163 [0015] migration/15[105] 0.000 0.000 0.000 2090450.764292 [0016] migration/16[111] 0.000 0.000 0.000 2090450.764371 [0017] migration/17[117] 0.000 0.000 0.000 2090450.764422 [0018] migration/18[123] 0.000 0.000 0.000 2090450.764490 [0000] 0.000 0.000 1.255 2090450.764505 [0000] s1-perf[8235/7168] 0.000 0.000 0.000 2090450.764571 [0016] 0.000 0.000 0.278 2090450.764588 [0010] 0.000 0.000 0.725 2090450.764590 [0016] s1-agent[7179/7162] 0.000 0.000 0.000 2090450.764635 [0000] 0.015 0.015 0.129 2090450.764637 [0017] 0.000 0.000 0.266 2090450.764639 [0000] s1-perf[8235/7168] 0.000 0.000 0.000 2090450.764668 [0017] s1-agent[7180/7162] 0.000 0.000 0.000 2090450.764669 [0000] 0.003 0.003 0.029 2090450.764672 [0000] s1-perf[8235/7168] 0.000 0.000 0.000 2090450.764683 [0000] 0.003 0.003 0.010 After: # perf sched timehist -I Samples of sched_switch event do not have callchains. 
time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 2090450.764490 [0000] 0.000 0.000 1.255 2090450.764571 [0016] 0.000 0.000 0.278 2090450.764588 [0010] 0.000 0.000 0.725 2090450.764635 [0000] 0.015 0.015 0.129 2090450.764637 [0017] 0.000 0.000 0.266 2090450.764669 [0000] 0.003 0.003 0.029 2090450.764683 [0000] 0.003 0.003 0.010 2090450.764688 [0016] 0.019 0.019 0.097 2090450.764694 [0000] 0.001 0.001 0.009 2090450.764706 [0000] 0.001 0.001 0.010 2090450.764725 [0002] 0.000 0.000 1.415 2090450.764728 [0000] 0.002 0.002 0.019 2090450.764823 [0000] 0.003 0.003 0.091 2090450.764838 [0019] 0.000 0.000 0.154 2090450.764865 [0002] 0.109 0.109 0.029 2090450.764866 [0000] 0.012 0.012 0.030 2090450.764880 [0002] 0.013 0.013 0.001 2090450.764880 [0000] 0.002 0.002 0.011 2090450.764896 [0000] 0.001 0.001 0.013 2090450.764903 [0019] 0.063 0.063 0.002 2090450.764908 [0019] 0.003 0.003 0.001 Fixes: 07235f84ece6b66f ("perf sched timehist: Add -I/--idle-hist option") Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812132606.3126490-1-yangjihong@bytedance.com Reviewed-and-tested-by: Madadi Vineeth Reddy Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 1c386ebe4b98..72f0e7f97f56 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2730,10 +2730,10 @@ static int timehist_sched_change_event(const struct perf_tool *tool, itr->last_thread = NULL; } - } - if (!sched->summary_only) - timehist_print_sample(sched, evsel, sample, &al, thread, t, state); + if (!sched->summary_only) + timehist_print_sample(sched, evsel, sample, &al, thread, t, state); + } out: if (sched->hist_time.start == 0 && t >= ptime->start) -- cgit v1.2.3 From b93fb9cf45a99042f4472678fc67afd1de47c32e Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 12 Aug 2024 21:26:06 +0800 Subject: perf sched timehist: Remove redundant BUG_ON in timehist_sched_change_event() The BUG_ON(thread__tid(thread) != 0) in timehist_sched_change_event() is redundant, remove it. No functional change. 
Fixes: 07235f84ece6b66f ("perf sched timehist: Add -I/--idle-hist option") Reviewed-by: Madadi Vineeth Reddy Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812132606.3126490-2-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 72f0e7f97f56..dcae3f1f5f3c 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2703,8 +2703,6 @@ static int timehist_sched_change_event(const struct perf_tool *tool, struct idle_thread_runtime *itr = (void *)tr; struct thread_runtime *last_tr; - BUG_ON(thread__tid(thread) != 0); - if (itr->last_thread == NULL) goto out; -- cgit v1.2.3 From 3fcd740990dee9404f4f6ebeb7a31f0066849e46 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 19 Aug 2024 11:30:15 +0800 Subject: perf sched timehist: Add --show-prio option The --show-prio option is used to display the priority of a task. It is disabled by default, which is consistent with the original behavior. The display format is xxx (the priority did not change while the task ran) or xxx->yyy (the priority changed while the task ran). Testcase: # perf sched record nice -n 9 true [ perf record: Woken up 0 times to write data ] [ perf record: Captured and wrote 0.497 MB perf.data ] # perf sched timehist -h Usage: perf sched timehist [] -C, --cpu list of cpus to profile -D, --dump-raw-trace dump raw trace in ASCII -f, --force don't complain, do it -g, --call-graph Display call chains if present (default on) -I, --idle-hist Show idle events only -i, --input input file name -k, --vmlinux vmlinux pathname -M, --migrations Show migration events -n, --next Show next task -p, --pid analyze events only for given process id(s) -s, --summary Show only syscall summary with statistics -S, --with-summary Show all syscalls and summary with statistics -t, --tid analyze events only for given thread id(s) -V, --cpu-visual Add CPU visual -v, --verbose be more verbose (show symbol address, etc) -w, --wakeups Show wakeup events --kallsyms kallsyms pathname --max-stack Maximum number of functions to display backtrace. --show-prio Show task priority --state Show task state when sched-out --symfs Look for files with symbols relative to this directory --time Time span for analysis (start,stop) # perf sched timehist Samples of sched_switch event do not have callchains. time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 23952.006537 [0000] perf[534] 0.000 0.000 0.000 23952.006593 [0000] migration/0[19] 0.000 0.014 0.056 23952.006899 [0001] perf[534] 0.000 0.000 0.000 23952.006947 [0001] migration/1[22] 0.000 0.015 0.047 23952.007138 [0002] perf[534] 0.000 0.000 0.000 # perf sched timehist --show-prio Samples of sched_switch event do not have callchains.
time cpu task name prio wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ -------- --------- --------- --------- 23952.006537 [0000] perf[534] 120 0.000 0.000 0.000 23952.006593 [0000] migration/0[19] 0 0.000 0.014 0.056 23952.006899 [0001] perf[534] 120 0.000 0.000 0.000 23952.034843 [0003] nice[535] 120->129 0.189 0.024 23.314 23952.053838 [0005] rcu_preempt[16] 120 3.993 0.000 0.023 23952.053990 [0005] 120 0.023 0.023 0.152 23952.054137 [0006] 120 1.427 1.427 17.855 23952.054278 [0007] 120 0.506 0.506 1.650 Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819033016.2427235-2-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 3 ++ tools/perf/builtin-sched.c | 91 ++++++++++++++++++++++++++++++--- 2 files changed, 87 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 84d49f9241b1..3efa5c58418d 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -212,6 +212,9 @@ OPTIONS for 'perf sched timehist' --state:: Show task state when it switched out. +--show-prio:: + Show task priority. + OPTIONS for 'perf sched replay' ------------------------------ diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index dcae3f1f5f3c..ba6563dc93d0 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -228,6 +228,7 @@ struct perf_sched { bool show_next; bool show_migrations; bool show_state; + bool show_prio; u64 skipped_samples; const char *time_str; struct perf_time_interval ptime; @@ -258,6 +259,8 @@ struct thread_runtime { bool comm_changed; u64 migrations; + + int prio; }; /* per event run time data */ @@ -920,6 +923,11 @@ struct sort_dimension { struct list_head list; }; +static inline void init_prio(struct thread_runtime *r) +{ + r->prio = -1; +} + /* * handle runtime stats saved per thread */ @@ -932,6 +940,7 @@ static struct thread_runtime *thread__init_runtime(struct thread *thread) return NULL; init_stats(&r->run_stats); + init_prio(r); thread__set_priv(thread, r); return r; @@ -2036,6 +2045,24 @@ static char *timehist_get_commstr(struct thread *thread) return str; } +/* prio field format: xxx or xxx->yyy */ +#define MAX_PRIO_STR_LEN 8 +static char *timehist_get_priostr(struct evsel *evsel, + struct thread *thread, + struct perf_sample *sample) +{ + static char prio_str[16]; + int prev_prio = (int)evsel__intval(evsel, sample, "prev_prio"); + struct thread_runtime *tr = thread__priv(thread); + + if (tr->prio != prev_prio && tr->prio != -1) + scnprintf(prio_str, sizeof(prio_str), "%d->%d", tr->prio, prev_prio); + else + scnprintf(prio_str, sizeof(prio_str), "%d", prev_prio); + + return prio_str; +} + static void timehist_header(struct perf_sched *sched) { u32 ncpus = sched->max_cpu.cpu + 1; @@ -2053,8 +2080,14 @@ static void timehist_header(struct perf_sched *sched) printf(" "); } - printf(" %-*s %9s %9s %9s", comm_width, - "task name", "wait time", "sch delay", "run time"); + if (sched->show_prio) { + printf(" %-*s %-*s %9s %9s %9s", + comm_width, "task name", MAX_PRIO_STR_LEN, "prio", + "wait time", "sch delay", "run time"); + } else { + printf(" %-*s %9s %9s %9s", comm_width, + "task name", "wait time", 
"sch delay", "run time"); + } if (sched->show_state) printf(" %s", "state"); @@ -2069,8 +2102,14 @@ static void timehist_header(struct perf_sched *sched) if (sched->show_cpu_visual) printf(" %*s ", ncpus, ""); - printf(" %-*s %9s %9s %9s", comm_width, - "[tid/pid]", "(msec)", "(msec)", "(msec)"); + if (sched->show_prio) { + printf(" %-*s %-*s %9s %9s %9s", + comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "", + "(msec)", "(msec)", "(msec)"); + } else { + printf(" %-*s %9s %9s %9s", comm_width, + "[tid/pid]", "(msec)", "(msec)", "(msec)"); + } if (sched->show_state) printf(" %5s", ""); @@ -2085,9 +2124,15 @@ static void timehist_header(struct perf_sched *sched) if (sched->show_cpu_visual) printf(" %.*s ", ncpus, graph_dotted_line); - printf(" %.*s %.9s %.9s %.9s", comm_width, - graph_dotted_line, graph_dotted_line, graph_dotted_line, - graph_dotted_line); + if (sched->show_prio) { + printf(" %.*s %.*s %.9s %.9s %.9s", + comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line, + graph_dotted_line, graph_dotted_line, graph_dotted_line); + } else { + printf(" %.*s %.9s %.9s %.9s", comm_width, + graph_dotted_line, graph_dotted_line, graph_dotted_line, + graph_dotted_line); + } if (sched->show_state) printf(" %.5s", graph_dotted_line); @@ -2134,6 +2179,9 @@ static void timehist_print_sample(struct perf_sched *sched, printf(" %-*s ", comm_width, timehist_get_commstr(thread)); + if (sched->show_prio) + printf(" %-*s ", MAX_PRIO_STR_LEN, timehist_get_priostr(evsel, thread, sample)); + wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt; print_sched_time(wait_time, 6); @@ -2301,6 +2349,7 @@ static int init_idle_thread(struct thread *thread) if (itr == NULL) return -ENOMEM; + init_prio(&itr->tr); init_stats(&itr->tr.run_stats); callchain_init(&itr->callchain); callchain_cursor_reset(&itr->cursor); @@ -2627,6 +2676,30 @@ static int timehist_migrate_task_event(const struct perf_tool *tool, return 0; } +static void timehist_update_task_prio(struct evsel *evsel, + struct perf_sample *sample, + struct machine *machine) +{ + struct thread *thread; + struct thread_runtime *tr = NULL; + const u32 next_pid = evsel__intval(evsel, sample, "next_pid"); + const u32 next_prio = evsel__intval(evsel, sample, "next_prio"); + + if (next_pid == 0) + thread = get_idle_thread(sample->cpu); + else + thread = machine__findnew_thread(machine, -1, next_pid); + + if (thread == NULL) + return; + + tr = thread__get_runtime(thread); + if (tr == NULL) + return; + + tr->prio = next_prio; +} + static int timehist_sched_change_event(const struct perf_tool *tool, union perf_event *event, struct evsel *evsel, @@ -2650,6 +2723,9 @@ static int timehist_sched_change_event(const struct perf_tool *tool, goto out; } + if (sched->show_prio) + timehist_update_task_prio(evsel, sample, machine); + thread = timehist_get_thread(sched, sample, machine, evsel); if (thread == NULL) { rc = -1; @@ -3684,6 +3760,7 @@ int cmd_sched(int argc, const char **argv) OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]", "analyze events only for given thread id(s)"), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), + OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), OPT_PARENT(sched_options) }; -- cgit v1.2.3 From 9b3a48bbe20d9692e0d456488cd3a6244cac3e3b Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 19 Aug 2024 11:30:16 +0800 Subject: perf sched timehist: Add --prio option The --prio option is used to only show events for the given task priority(ies). 
The default is to show events for all priority tasks, which is consistent with the previous behavior. Testcase: # perf sched record nice -n 9 perf bench sched messaging -l 10000 # Running 'sched/messaging' benchmark: # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 3.435 [sec] [ perf record: Woken up 270 times to write data ] [ perf record: Captured and wrote 618.688 MB perf.data (5729036 samples) ] # perf sched timehist -h Usage: perf sched timehist [] -C, --cpu list of cpus to profile -D, --dump-raw-trace dump raw trace in ASCII -f, --force don't complain, do it -g, --call-graph Display call chains if present (default on) -I, --idle-hist Show idle events only -i, --input input file name -k, --vmlinux vmlinux pathname -M, --migrations Show migration events -n, --next Show next task -p, --pid analyze events only for given process id(s) -s, --summary Show only syscall summary with statistics -S, --with-summary Show all syscalls and summary with statistics -t, --tid analyze events only for given thread id(s) -V, --cpu-visual Add CPU visual -v, --verbose be more verbose (show symbol address, etc) -w, --wakeups Show wakeup events --kallsyms kallsyms pathname --max-stack Maximum number of functions to display backtrace. --prio analyze events only for given task priority(ies) --show-prio Show task priority --state Show task state when sched-out --symfs Look for files with symbols relative to this directory --time Time span for analysis (start,stop) # perf sched timehist --prio 140 Samples of sched_switch event do not have callchains. Invalid prio string # perf sched timehist --show-prio --prio 129 Samples of sched_switch event do not have callchains. time cpu task name prio wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ -------- --------- --------- --------- 2090450.765421 [0002] sched-messaging[1229618] 129 0.000 0.000 0.029 2090450.765445 [0007] sched-messaging[1229616] 129 0.000 0.062 0.043 2090450.765448 [0014] sched-messaging[1229619] 129 0.000 0.000 0.032 2090450.765478 [0013] sched-messaging[1229617] 129 0.000 0.065 0.048 2090450.765503 [0014] sched-messaging[1229622] 129 0.000 0.000 0.017 2090450.765550 [0002] sched-messaging[1229624] 129 0.000 0.000 0.021 2090450.765562 [0007] sched-messaging[1229621] 129 0.000 0.071 0.028 2090450.765570 [0005] sched-messaging[1229620] 129 0.000 0.064 0.066 2090450.765583 [0001] sched-messaging[1229625] 129 0.000 0.001 0.031 2090450.765595 [0013] sched-messaging[1229623] 129 0.000 0.060 0.028 2090450.765637 [0014] sched-messaging[1229628] 129 0.000 0.000 0.019 2090450.765665 [0007] sched-messaging[1229627] 129 0.000 0.038 0.030 # perf sched timehist --show-prio --prio 0,120-129 Samples of sched_switch event do not have callchains. 
time cpu task name prio wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ -------- --------- --------- --------- 2090450.763231 [0000] perf[1229608] 120 0.000 0.000 0.000 2090450.763235 [0000] migration/0[15] 0 0.000 0.001 0.003 2090450.763263 [0001] perf[1229608] 120 0.000 0.000 0.000 2090450.763268 [0001] migration/1[21] 0 0.000 0.001 0.004 2090450.763302 [0002] perf[1229608] 120 0.000 0.000 0.000 2090450.763309 [0002] migration/2[27] 0 0.000 0.001 0.007 2090450.763338 [0003] perf[1229608] 120 0.000 0.000 0.000 2090450.763343 [0003] migration/3[33] 0 0.000 0.001 0.004 2090450.763459 [0004] perf[1229608] 120 0.000 0.000 0.000 2090450.763469 [0004] migration/4[39] 0 0.000 0.002 0.010 2090450.763496 [0005] perf[1229608] 120 0.000 0.000 0.000 2090450.763501 [0005] migration/5[45] 0 0.000 0.001 0.004 2090450.763613 [0006] perf[1229608] 120 0.000 0.000 0.000 2090450.763622 [0006] migration/6[51] 0 0.000 0.001 0.008 2090450.763652 [0007] perf[1229608] 120 0.000 0.000 0.000 2090450.763660 [0007] migration/7[57] 0 0.000 0.001 0.008 2090450.765665 [0001] 120 0.031 0.031 0.081 2090450.765665 [0007] sched-messaging[1229627] 129 0.000 0.038 0.030 2090450.765667 [0000] s1-perf[8235/7168] 120 0.008 0.000 0.004 2090450.765684 [0013] 120 0.028 0.028 0.088 2090450.765685 [0001] sched-messaging[1229630] 129 0.000 0.001 0.020 2090450.765688 [0000] 120 0.004 0.004 0.020 2090450.765689 [0002] 120 0.021 0.021 0.138 2090450.765691 [0005] sched-messaging[1229626] 129 0.000 0.085 0.029 Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819033016.2427235-3-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 6 +++ tools/perf/builtin-sched.c | 74 ++++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 3efa5c58418d..3db64954a267 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -215,6 +215,12 @@ OPTIONS for 'perf sched timehist' --show-prio:: Show task priority. +--prio:: + Only show events for given task priority(ies). Multiple priorities can be + provided as a comma-separated list with no spaces: 0,120. Ranges of + priorities are specified with -: 120-129. A combination of both can also be + provided: 0,120-129. 
+ OPTIONS for 'perf sched replay' ------------------------------ diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index ba6563dc93d0..0a7b2b2acd56 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -51,6 +51,7 @@ #define COMM_LEN 20 #define SYM_LEN 129 #define MAX_PID 1024000 +#define MAX_PRIO 140 static const char *cpu_list; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -234,6 +235,8 @@ struct perf_sched { struct perf_time_interval ptime; struct perf_time_interval hist_time; volatile bool thread_funcs_exit; + const char *prio_str; + DECLARE_BITMAP(prio_bitmap, MAX_PRIO); }; /* per thread run time data */ @@ -2504,12 +2507,33 @@ static bool timehist_skip_sample(struct perf_sched *sched, struct perf_sample *sample) { bool rc = false; + int prio = -1; + struct thread_runtime *tr = NULL; if (thread__is_filtered(thread)) { rc = true; sched->skipped_samples++; } + if (sched->prio_str) { + /* + * Because priority may be changed during task execution, + * first read priority from prev sched_in event for current task. + * If prev sched_in event is not saved, then read priority from + * current task sched_out event. + */ + tr = thread__get_runtime(thread); + if (tr && tr->prio != -1) + prio = tr->prio; + else if (evsel__name_is(evsel, "sched:sched_switch")) + prio = evsel__intval(evsel, sample, "prev_prio"); + + if (prio != -1 && !test_bit(prio, sched->prio_bitmap)) { + rc = true; + sched->skipped_samples++; + } + } + if (sched->idle_hist) { if (!evsel__name_is(evsel, "sched:sched_switch")) rc = true; @@ -2723,7 +2747,7 @@ static int timehist_sched_change_event(const struct perf_tool *tool, goto out; } - if (sched->show_prio) + if (sched->show_prio || sched->prio_str) timehist_update_task_prio(evsel, sample, machine); thread = timehist_get_thread(sched, sample, machine, evsel); @@ -3143,6 +3167,47 @@ static int timehist_check_attr(struct perf_sched *sched, return 0; } +static int timehist_parse_prio_str(struct perf_sched *sched) +{ + char *p; + unsigned long start_prio, end_prio; + const char *str = sched->prio_str; + + if (!str) + return 0; + + while (isdigit(*str)) { + p = NULL; + start_prio = strtoul(str, &p, 0); + if (start_prio >= MAX_PRIO || (*p != '\0' && *p != ',' && *p != '-')) + return -1; + + if (*p == '-') { + str = ++p; + p = NULL; + end_prio = strtoul(str, &p, 0); + + if (end_prio >= MAX_PRIO || (*p != '\0' && *p != ',')) + return -1; + + if (end_prio < start_prio) + return -1; + } else { + end_prio = start_prio; + } + + for (; start_prio <= end_prio; start_prio++) + __set_bit(start_prio, sched->prio_bitmap); + + if (*p) + ++p; + + str = p; + } + + return 0; +} + static int perf_sched__timehist(struct perf_sched *sched) { struct evsel_str_handler handlers[] = { @@ -3204,6 +3269,11 @@ static int perf_sched__timehist(struct perf_sched *sched) if (timehist_check_attr(sched, evlist) != 0) goto out; + if (timehist_parse_prio_str(sched) != 0) { + pr_err("Invalid prio string\n"); + goto out; + } + setup_pager(); /* prefer sched_waking if it is captured */ @@ -3761,6 +3831,8 @@ int cmd_sched(int argc, const char **argv) "analyze events only for given thread id(s)"), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), + OPT_STRING(0, "prio", &sched.prio_str, "prio", + "analyze events only for given task priority(ies)"), OPT_PARENT(sched_options) }; -- cgit v1.2.3 From beef8fb2af95ae8aa77f05a2663710d260741ddd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 
17 Jul 2024 17:30:20 -0700 Subject: perf pmu: Merge boolean sysfs event option parsing Merge perf_pmu__parse_per_pkg() and perf_pmu__parse_snapshot() that do the same parsing except for the file suffix used. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Bjorn Helgaas Cc: Dhananjay Ugwekar Cc: Dominique Martinet Cc: Gautham Shenoy Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Jonathan Corbet Cc: K Prateek Nayak Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Link: https://lore.kernel.org/r/20240718003025.1486232-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 0b38c51bd6eb..ef8ab918eaeb 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -367,8 +367,8 @@ error: return -1; } -static int -perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias) +static bool perf_pmu__parse_event_source_bool(const char *pmu_name, const char *event_name, + const char *suffix) { char path[PATH_MAX]; size_t len; @@ -376,37 +376,36 @@ perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias) len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path)); if (!len) - return 0; - scnprintf(path + len, sizeof(path) - len, "%s/events/%s.per-pkg", pmu->name, alias->name); + return false; + + scnprintf(path + len, sizeof(path) - len, "%s/events/%s.%s", pmu_name, event_name, suffix); fd = open(path, O_RDONLY); if (fd == -1) - return -1; + return false; - close(fd); +#ifndef NDEBUG + { + char buf[8]; - alias->per_pkg = true; - return 0; + len = read(fd, buf, sizeof(buf)); + assert(len == 1 || len == 2); + assert(buf[0] == '1'); + } +#endif + + close(fd); + return true; } -static int perf_pmu__parse_snapshot(struct perf_pmu *pmu, struct perf_pmu_alias *alias) +static void perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias) { - char path[PATH_MAX]; - size_t len; - int fd; - - len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path)); - if (!len) - return 0; - scnprintf(path + len, sizeof(path) - len, "%s/events/%s.snapshot", pmu->name, alias->name); - - fd = open(path, O_RDONLY); - if (fd == -1) - return -1; + alias->per_pkg = perf_pmu__parse_event_source_bool(pmu->name, alias->name, "per-pkg"); +} - alias->snapshot = true; - close(fd); - return 0; +static void perf_pmu__parse_snapshot(struct perf_pmu *pmu, struct perf_pmu_alias *alias) +{ + alias->snapshot = perf_pmu__parse_event_source_bool(pmu->name, alias->name, "snapshot"); } /* Delete an alias entry. */ -- cgit v1.2.3 From f76e3525acf395f22712fc9120de555fb4b1a52f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 Jul 2024 17:30:21 -0700 Subject: perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event() Previously the cpu_list was a string, and typically no cpu_list was passed to __add_event(). Wanting to make events have their cpus distinct from the PMU means that on more occasions we want to pass a cpu_list. If we're reading this from sysfs, it is easier to read a perf_cpu_map than to allocate and pass around strings that will later be parsed.
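As a minimal sketch (not part of the patch) of the reference-counting discipline the new interface relies on: the caller owns one reference from perf_cpu_map__new(), __add_event() takes its own via perf_cpu_map__get(), so the caller must drop its reference with perf_cpu_map__put() once the event has been added.

        #include <perf/cpumap.h>

        void example(void)
        {
                /* build a map holding only CPU 0; the caller owns one reference */
                struct perf_cpu_map *cpus = perf_cpu_map__new("0");

                /* __add_event(..., cpus) would take its own reference internally */

                perf_cpu_map__put(cpus);        /* drop the caller's reference */
        }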
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Bjorn Helgaas Cc: Dhananjay Ugwekar Cc: Dominique Martinet Cc: Gautham Shenoy Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Jonathan Corbet Cc: K Prateek Nayak Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Link: https://lore.kernel.org/r/20240718003025.1486232-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index fab01ba54e34..9c0ce01684c3 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -227,12 +227,12 @@ __add_event(struct list_head *list, int *idx, bool init_attr, const char *name, const char *metric_id, struct perf_pmu *pmu, struct list_head *config_terms, bool auto_merge_stats, - const char *cpu_list) + struct perf_cpu_map *cpu_list) { struct evsel *evsel; - struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) : - cpu_list ? perf_cpu_map__new(cpu_list) : NULL; + struct perf_cpu_map *cpus = perf_cpu_map__is_empty(cpu_list) && pmu ? pmu->cpus : cpu_list; + cpus = perf_cpu_map__get(cpus); if (pmu) perf_pmu__warn_invalid_formats(pmu); @@ -305,16 +305,17 @@ static int add_event_tool(struct list_head *list, int *idx, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_DUMMY, }; - const char *cpu_list = NULL; + struct perf_cpu_map *cpu_list = NULL; if (tool_event == PERF_TOOL_DURATION_TIME) { /* Duration time is gathered globally, pretend it is only on CPU0. */ - cpu_list = "0"; + cpu_list = perf_cpu_map__new("0"); } evsel = __add_event(list, idx, &attr, /*init_attr=*/true, /*name=*/NULL, /*metric_id=*/NULL, /*pmu=*/NULL, /*config_terms=*/NULL, /*auto_merge_stats=*/false, cpu_list); + perf_cpu_map__put(cpu_list); if (!evsel) return -ENOMEM; evsel->tool_event = tool_event; -- cgit v1.2.3 From 1c1d348b08ab7c106dd96bafb120f0888827174b Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Tue, 27 Aug 2024 17:51:23 +0800 Subject: tools/testing/cxl: Use dev_is_platform() Use dev_is_platform() instead of checking bus type directly. Signed-off-by: Kunwu Chan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240827095123.168696-1-kunwu.chan@linux.dev Signed-off-by: Dave Jiang --- tools/testing/cxl/mock_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/cxl/mock_acpi.c b/tools/testing/cxl/mock_acpi.c index 55813de26d46..8da94378ccec 100644 --- a/tools/testing/cxl/mock_acpi.c +++ b/tools/testing/cxl/mock_acpi.c @@ -18,7 +18,7 @@ struct acpi_device *to_cxl_host_bridge(struct device *host, struct device *dev) goto out; } - if (dev->bus == &platform_bus_type) + if (dev_is_platform(dev)) goto out; adev = to_acpi_device(dev); -- cgit v1.2.3 From 1b2965a8cd8d444cbea891e55083b5987d00cc66 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:55 +0200 Subject: selftests: mptcp: join: reduce join_nr params chk_join_nr() currently takes 9 positional parameters, 6 of them are optional. It makes it hard to read: chk_join_nr 1 1 1 1 0 1 1 0 4 Naming these vars helps to make it easier to read: join_csum_ns1=1 join_csum_ns2=0 \ join_fail_nr=1 join_rst_nr=1 join_infi_nr=0 \ join_corrupted_pkts=4 \ chk_join_nr 1 1 1 It will then be easier to add new optional parameters. 
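The `name=value \ chk_join_nr ...` form works because chk_join_nr() is a shell function: in bash, assignments prefixing a function call are placed in its environment for that single invocation and restored afterwards. A standalone sketch of the mechanism (illustrative, not from the patch):

        chk() { echo "a=${opt_a:-0} b=${opt_b:-0}"; }
        chk             # prints "a=0 b=0" (defaults)
        opt_a=1 \
        chk             # prints "a=1 b=0"; opt_a is unset again afterwards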
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-4-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 31 ++++++++++++++++++------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a4762c49a878..51b226784c6b 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -61,6 +61,12 @@ unset sflags unset fastclose unset fullmesh unset speed +unset join_csum_ns1 +unset join_csum_ns2 +unset join_fail_nr +unset join_rst_nr +unset join_infi_nr +unset join_corrupted_pkts # generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) || # (ip6 && (ip6[74] & 0xf0) == 0x30)'" @@ -1319,12 +1325,12 @@ chk_join_nr() local syn_nr=$1 local syn_ack_nr=$2 local ack_nr=$3 - local csum_ns1=${4:-0} - local csum_ns2=${5:-0} - local fail_nr=${6:-0} - local rst_nr=${7:-0} - local infi_nr=${8:-0} - local corrupted_pkts=${9:-0} + local csum_ns1=${join_csum_ns1:-0} + local csum_ns2=${join_csum_ns2:-0} + local fail_nr=${join_fail_nr:-0} + local rst_nr=${join_rst_nr:-0} + local infi_nr=${join_infi_nr:-0} + local corrupted_pkts=${join_corrupted_pkts:-0} local count local with_cookie @@ -3164,7 +3170,8 @@ fastclose_tests() MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 0 0 0 1 + join_rst_nr=1 \ + chk_join_nr 0 0 0 chk_fclose_nr 1 1 invert chk_rst_nr 1 1 fi @@ -3183,7 +3190,10 @@ fail_tests() MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=128 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)" + join_csum_ns1=+1 join_csum_ns2=+0 \ + join_fail_nr=1 join_rst_nr=0 join_infi_nr=1 \ + join_corrupted_pkts="$(pedit_action_pkts)" \ + chk_join_nr 0 0 0 chk_fail_nr 1 -1 invert fi @@ -3196,7 +3206,10 @@ fail_tests() pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow test_linkfail=1024 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 1 0 1 1 0 "$(pedit_action_pkts)" + join_csum_ns1=1 join_csum_ns2=0 \ + join_fail_nr=1 join_rst_nr=1 join_infi_nr=0 \ + join_corrupted_pkts="$(pedit_action_pkts)" \ + chk_join_nr 1 1 1 fi } -- cgit v1.2.3 From ba8a664004da84e4e04205429043a1bc19e00d23 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:56 +0200 Subject: selftests: mptcp: join: one line for join check Most tests are checking if the expected number of SYN/SYN+ACK/ACK JOINs have been received, each of them on one line. More Join related tests are going to be checked soon, no need to add 5 new lines per test in case of success, just one is enough. In case of issue, the errors will still be reported like before. 
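Condensed, the pattern the patch introduces looks like this (a sketch using names from the patch): each sub-check only downgrades a shared rc instead of printing its own result line, and one summary line is emitted at the end.

        rc=${KSFT_PASS}
        if [ -z "$count" ]; then
                rc=${KSFT_SKIP}
        elif [ "$count" != "$expected" ]; then
                rc=${KSFT_FAIL}         # details still reported via fail_test
        fi
        print_results "join Rx" ${rc}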
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-5-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 45 ++++++++++++++++--------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 51b226784c6b..63580a5810bf 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -202,6 +202,22 @@ print_skip() mptcp_lib_pr_skip "${@}" } +# $1: check name; $2: rc +print_results() +{ + local check="${1}" + local rc=${2} + + print_check "${check}" + if [ ${rc} = ${KSFT_PASS} ]; then + print_ok + elif [ ${rc} = ${KSFT_SKIP} ]; then + print_skip + else + fail_test "see above" + fi +} + # [ $1: fail msg ] mark_as_skipped() { @@ -1331,6 +1347,7 @@ chk_join_nr() local rst_nr=${join_rst_nr:-0} local infi_nr=${join_infi_nr:-0} local corrupted_pkts=${join_corrupted_pkts:-0} + local rc=${KSFT_PASS} local count local with_cookie @@ -1338,43 +1355,41 @@ chk_join_nr() print_info "${corrupted_pkts} corrupted pkts" fi - print_check "syn" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$syn_nr" ]; then + rc=${KSFT_FAIL} + print_check "syn" fail_test "got $count JOIN[s] syn expected $syn_nr" - else - print_ok fi - print_check "synack" with_cookie=$(ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies) count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$syn_ack_nr" ]; then # simult connections exceeding the limit with cookie enabled could go up to # synack validation as the conn limit can be enforced reliably only after # the subflow creation - if [ "$with_cookie" = 2 ] && [ "$count" -gt "$syn_ack_nr" ] && [ "$count" -le "$syn_nr" ]; then - print_ok - else + if [ "$with_cookie" != 2 ] || [ "$count" -le "$syn_ack_nr" ] || [ "$count" -gt "$syn_nr" ]; then + rc=${KSFT_FAIL} + print_check "synack" fail_test "got $count JOIN[s] synack expected $syn_ack_nr" fi - else - print_ok fi - print_check "ack" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$ack_nr" ]; then + rc=${KSFT_FAIL} + print_check "ack" fail_test "got $count JOIN[s] ack expected $ack_nr" - else - print_ok fi + + print_results "join Rx" ${rc} + if $validate_checksum; then chk_csum_nr $csum_ns1 $csum_ns2 chk_fail_nr $fail_nr $fail_nr -- cgit v1.2.3 From 004125c251a6826d080bb28feb48ebc5454ada81 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:57 +0200 Subject: selftests: mptcp: join: validate MPJ SYN TX MIB counters A few new MPJoinSynTx MIB counters have been added in a previous commit. They are validated here in the mptcp_join.sh selftest, each time the number of received MPJs is checked. Most of the time, the number of sent SYN+MPJs is the same as the number received. But sometimes there are more, because some are dropped, or there are errors. While at it, the "no MPC reuse with single endpoint" subtest has been modified to force a bind() error.
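As a hedged aside (not from the patch), the counters being validated can also be inspected by hand in the client namespace; they are exposed via /proc/net/netstat and readable with nstat, e.g.:

        ip netns exec "$ns2" nstat -asz MPTcpExtMPJoinSynTx \
                MPTcpExtMPJoinSynTxCreatSkErr \
                MPTcpExtMPJoinSynTxBindErr \
                MPTcpExtMPJoinSynTxConnectErr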
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-6-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 89 ++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 63580a5810bf..23f8e2254064 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -67,6 +67,10 @@ unset join_fail_nr unset join_rst_nr unset join_infi_nr unset join_corrupted_pkts +unset join_syn_tx +unset join_create_err +unset join_bind_err +unset join_connect_err # generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) || # (ip6 && (ip6[74] & 0xf0) == 0x30)'" @@ -1336,6 +1340,54 @@ chk_infi_nr() fi } +chk_join_tx_nr() +{ + local syn_tx=${join_syn_tx:-0} + local create=${join_create_err:-0} + local bind=${join_bind_err:-0} + local connect=${join_connect_err:-0} + local rc=${KSFT_PASS} + local count + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTx") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$syn_tx" ]; then + rc=${KSFT_FAIL} + print_check "syn tx" + fail_test "got $count JOIN[s] syn tx expected $syn_tx" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxCreatSkErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$create" ]; then + rc=${KSFT_FAIL} + print_check "syn tx create socket error" + fail_test "got $count JOIN[s] syn tx create socket error expected $create" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxBindErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$bind" ]; then + rc=${KSFT_FAIL} + print_check "syn tx bind error" + fail_test "got $count JOIN[s] syn tx bind error expected $bind" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxConnectErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$connect" ]; then + rc=${KSFT_FAIL} + print_check "syn tx connect error" + fail_test "got $count JOIN[s] syn tx connect error expected $connect" + fi + + print_results "join Tx" ${rc} +} + chk_join_nr() { local syn_nr=$1 @@ -1390,6 +1442,9 @@ chk_join_nr() print_results "join Rx" ${rc} + join_syn_tx="${join_syn_tx:-${syn_nr}}" \ + chk_join_tx_nr + if $validate_checksum; then chk_csum_nr $csum_ns1 $csum_ns2 chk_fail_nr $fail_nr $fail_nr @@ -1930,9 +1985,11 @@ subflows_error_tests() pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.12.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 + join_bind_err=1 \ + chk_join_nr 0 0 0 fi # multiple subflows, with subflow creation error @@ -1944,7 +2001,8 @@ subflows_error_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi # multiple subflows, with subflow timeout on MPJ @@ -1956,7 +2014,8 @@ subflows_error_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi # multiple subflows, check that the endpoint corresponding to @@ -1977,7 +2036,8 @@ subflows_error_tests() # additional subflow could be created only if the PM select # the later endpoint, skipping the already 
used one - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi } @@ -2063,7 +2123,8 @@ signal_address_tests() pm_nl_add_endpoint $ns1 10.0.14.1 flags signal pm_nl_set_limits $ns2 3 3 run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=3 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 fi @@ -2231,7 +2292,8 @@ add_addr_timeout_tests() pm_nl_set_limits $ns2 2 2 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 chk_add_nr 8 0 fi } @@ -2331,7 +2393,8 @@ remove_tests() pm_nl_set_limits $ns2 2 2 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 join_connect_err=1 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 chk_rm_nr 3 1 invert chk_rst_nr 0 0 @@ -2396,7 +2459,8 @@ remove_tests() pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=3 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 chk_rm_nr 3 1 invert chk_rst_nr 0 0 @@ -3703,7 +3767,8 @@ endpoint_tests() chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6 chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 5 # one has been closed before estab - chk_join_nr 6 6 6 + join_syn_tx=7 \ + chk_join_nr 6 6 6 chk_rm_nr 4 4 fi @@ -3775,7 +3840,8 @@ endpoint_tests() chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5 chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 3 - chk_join_nr 5 5 5 + join_connect_err=1 \ + chk_join_nr 5 5 5 chk_add_nr 6 6 chk_rm_nr 4 3 invert fi @@ -3806,7 +3872,8 @@ endpoint_tests() wait_mpj $ns2 mptcp_lib_kill_wait $tests_pid - chk_join_nr 2 2 2 + join_syn_tx=3 join_connect_err=1 \ + chk_join_nr 2 2 2 chk_add_nr 2 2 chk_rm_nr 1 0 invert fi -- cgit v1.2.3 From 6ed495345be8113899986d21c6cbc7d8f550dedb Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:58 +0200 Subject: selftests: mptcp: join: more explicit check name Before, the check names had to be very short. It is no longer the case now that these checks are printed on a dedicated line. Then, it looks better to have more explicit names. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-7-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 65 +++++++++++++------------ 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 23f8e2254064..7993e0e0029e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -865,7 +865,7 @@ chk_cestab_nr() local cestab=$2 local count - print_check "cestab $cestab" + print_check "currently established: $cestab" count=$(mptcp_lib_get_counter ${ns} "MPTcpExtMPCurrEstab") if [ -z "$count" ]; then print_skip @@ -1141,7 +1141,7 @@ chk_csum_nr() csum_ns2=${csum_ns2:1} fi - print_check "sum" + print_check "checksum server" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtDataCsumErr") if [ -n "$count" ] && [ "$count" != "$csum_ns1" ]; then extra_msg+=" ns1=$count" @@ -1154,7 +1154,8 @@ chk_csum_nr() else print_ok fi - print_check "csum" + + print_check "checksum client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtDataCsumErr") if [ -n "$count" ] && [ "$count" != "$csum_ns2" ]; then extra_msg+=" ns2=$count" @@ -1198,7 +1199,7 @@ chk_fail_nr() fail_rx=${fail_rx:1} fi - print_check "ftx" + print_check "fail tx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFailTx") if [ -n "$count" ] && [ "$count" != "$fail_tx" ]; then extra_msg+=",tx=$count" @@ -1212,7 +1213,7 @@ chk_fail_nr() print_ok fi - print_check "failrx" + print_check "fail rx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFailRx") if [ -n "$count" ] && [ "$count" != "$fail_rx" ]; then extra_msg+=",rx=$count" @@ -1245,7 +1246,7 @@ chk_fclose_nr() extra_msg="invert" fi - print_check "ctx" + print_check "fast close tx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFastcloseTx") if [ -z "$count" ]; then print_skip @@ -1256,7 +1257,7 @@ chk_fclose_nr() print_ok fi - print_check "fclzrx" + print_check "fast close rx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFastcloseRx") if [ -z "$count" ]; then print_skip @@ -1286,7 +1287,7 @@ chk_rst_nr() extra_msg="invert" fi - print_check "rtx" + print_check "reset tx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPRstTx") if [ -z "$count" ]; then print_skip @@ -1298,7 +1299,7 @@ chk_rst_nr() print_ok fi - print_check "rstrx" + print_check "reset rx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPRstRx") if [ -z "$count" ]; then print_skip @@ -1319,7 +1320,7 @@ chk_infi_nr() local infi_rx=$2 local count - print_check "itx" + print_check "infi tx" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtInfiniteMapTx") if [ -z "$count" ]; then print_skip @@ -1329,7 +1330,7 @@ chk_infi_nr() print_ok fi - print_check "infirx" + print_check "infi rx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtInfiniteMapRx") if [ -z "$count" ]; then print_skip @@ -1412,8 +1413,8 @@ chk_join_nr() rc=${KSFT_SKIP} elif [ "$count" != "$syn_nr" ]; then rc=${KSFT_FAIL} - print_check "syn" - fail_test "got $count JOIN[s] syn expected $syn_nr" + print_check "syn rx" + fail_test "got $count JOIN[s] syn rx expected $syn_nr" fi with_cookie=$(ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies) @@ -1426,8 +1427,8 @@ chk_join_nr() # the subflow creation if [ "$with_cookie" != 2 ] || [ "$count" -le "$syn_ack_nr" ] || [ "$count" -gt "$syn_nr" ]; then rc=${KSFT_FAIL} - print_check "synack" - 
fail_test "got $count JOIN[s] synack expected $syn_ack_nr" + print_check "synack rx" + fail_test "got $count JOIN[s] synack rx expected $syn_ack_nr" fi fi @@ -1436,8 +1437,8 @@ chk_join_nr() rc=${KSFT_SKIP} elif [ "$count" != "$ack_nr" ]; then rc=${KSFT_FAIL} - print_check "ack" - fail_test "got $count JOIN[s] ack expected $ack_nr" + print_check "ack rx" + fail_test "got $count JOIN[s] ack rx expected $ack_nr" fi print_results "join Rx" ${rc} @@ -1517,7 +1518,7 @@ chk_add_nr() timeout=$(ip netns exec ${ns_tx} sysctl -n net.mptcp.add_addr_timeout) - print_check "add" + print_check "add addr rx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtAddAddr") if [ -z "$count" ]; then print_skip @@ -1529,7 +1530,7 @@ chk_add_nr() print_ok fi - print_check "echo" + print_check "add addr echo rx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtEchoAdd") if [ -z "$count" ]; then print_skip @@ -1540,7 +1541,7 @@ chk_add_nr() fi if [ $port_nr -gt 0 ]; then - print_check "pt" + print_check "add addr rx with port" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtPortAdd") if [ -z "$count" ]; then print_skip @@ -1550,7 +1551,7 @@ chk_add_nr() print_ok fi - print_check "syn" + print_check "syn rx port" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortSynRx") if [ -z "$count" ]; then print_skip @@ -1561,7 +1562,7 @@ chk_add_nr() print_ok fi - print_check "synack" + print_check "synack rx port" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPJoinPortSynAckRx") if [ -z "$count" ]; then print_skip @@ -1572,7 +1573,7 @@ chk_add_nr() print_ok fi - print_check "ack" + print_check "ack rx port" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortAckRx") if [ -z "$count" ]; then print_skip @@ -1583,7 +1584,7 @@ chk_add_nr() print_ok fi - print_check "syn" + print_check "syn rx port mismatch" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortSynRx") if [ -z "$count" ]; then print_skip @@ -1594,7 +1595,7 @@ chk_add_nr() print_ok fi - print_check "ack" + print_check "ack rx port mismatch" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortAckRx") if [ -z "$count" ]; then print_skip @@ -1618,7 +1619,7 @@ chk_add_tx_nr() timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout) - print_check "add TX" + print_check "add addr tx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtAddAddrTx") if [ -z "$count" ]; then print_skip @@ -1630,7 +1631,7 @@ chk_add_tx_nr() print_ok fi - print_check "echo TX" + print_check "add addr echo tx" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtEchoAddTx") if [ -z "$count" ]; then print_skip @@ -1668,7 +1669,7 @@ chk_rm_nr() extra_msg="invert" fi - print_check "rm" + print_check "rm addr rx" count=$(mptcp_lib_get_counter ${addr_ns} "MPTcpExtRmAddr") if [ -z "$count" ]; then print_skip @@ -1678,7 +1679,7 @@ chk_rm_nr() print_ok fi - print_check "rmsf" + print_check "rm subflow" count=$(mptcp_lib_get_counter ${subflow_ns} "MPTcpExtRmSubflow") if [ -z "$count" ]; then print_skip @@ -1713,7 +1714,7 @@ chk_rm_tx_nr() { local rm_addr_tx_nr=$1 - print_check "rm TX" + print_check "rm addr tx" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtRmAddrTx") if [ -z "$count" ]; then print_skip @@ -1732,7 +1733,7 @@ chk_prio_nr() local mpj_syn_ack=$4 local count - print_check "ptx" + print_check "mp_prio tx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioTx") if [ -z "$count" ]; then print_skip @@ -1742,7 +1743,7 @@ chk_prio_nr() print_ok fi - print_check "prx" + print_check "mp_prio rx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioRx") if [ -z "$count" ]; 
then print_skip -- cgit v1.2.3 From 8d328dbcf61b50ca51f8a930561fe6ae2c8bf5e9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:59 +0200 Subject: selftests: mptcp: join: specify host being checked Instead of displaying 'invert' when looking at some events like MP_FAIL, MP_FASTCLOSE, MP_RESET, RM_ADDR, which is a bit vague because they are not traditionally sent from one side, the host being checked is now printed. For the ADD_ADDR, only display the host when it is the client sending it, which is more unusual. Also before, the 'invert' message was printed after a few checks, but it was not clear which ones exactly. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-8-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 89 +++++++++++++------------ 1 file changed, 45 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 7993e0e0029e..321197d8977e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1180,6 +1180,8 @@ chk_fail_nr() local count local ns_tx=$ns1 local ns_rx=$ns2 + local tx="server" + local rx="client" local extra_msg="" local allow_tx_lost=0 local allow_rx_lost=0 @@ -1187,7 +1189,8 @@ chk_fail_nr() if [[ $ns_invert = "invert" ]]; then ns_tx=$ns2 ns_rx=$ns1 - extra_msg="invert" + tx="client" + rx="server" fi if [[ "${fail_tx}" = "-"* ]]; then @@ -1199,10 +1202,10 @@ chk_fail_nr() fail_rx=${fail_rx:1} fi - print_check "fail tx" + print_check "fail tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFailTx") if [ -n "$count" ] && [ "$count" != "$fail_tx" ]; then - extra_msg+=",tx=$count" + extra_msg+=" tx=$count" fi if [ -z "$count" ]; then print_skip @@ -1213,10 +1216,10 @@ chk_fail_nr() print_ok fi - print_check "fail rx" + print_check "fail rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFailRx") if [ -n "$count" ] && [ "$count" != "$fail_rx" ]; then - extra_msg+=",rx=$count" + extra_msg+=" rx=$count" fi if [ -z "$count" ]; then print_skip @@ -1238,37 +1241,35 @@ chk_fclose_nr() local count local ns_tx=$ns2 local ns_rx=$ns1 - local extra_msg="" + local tx="client" + local rx="server" if [[ $ns_invert = "invert" ]]; then ns_tx=$ns1 ns_rx=$ns2 - extra_msg="invert" + tx="server" + rx="client" fi - print_check "fast close tx" + print_check "fast close tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFastcloseTx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$fclose_tx" ]; then - extra_msg+=",tx=$count" fail_test "got $count MP_FASTCLOSE[s] TX expected $fclose_tx" else print_ok fi - print_check "fast close rx" + print_check "fast close rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFastcloseRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$fclose_rx" ]; then - extra_msg+=",rx=$count" fail_test "got $count MP_FASTCLOSE[s] RX expected $fclose_rx" else print_ok fi - - print_info "$extra_msg" } chk_rst_nr() @@ -1279,15 +1280,17 @@ chk_rst_nr() local count local ns_tx=$ns1 local ns_rx=$ns2 - local extra_msg="" + local tx="server" + local rx="client" if [[ $ns_invert = "invert" ]]; then ns_tx=$ns2 ns_rx=$ns1 - extra_msg="invert" + tx="client" + rx="server" fi - print_check "reset tx" + print_check "reset tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPRstTx") if [
-z "$count" ]; then print_skip @@ -1299,7 +1302,7 @@ chk_rst_nr() print_ok fi - print_check "reset rx" + print_check "reset rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPRstRx") if [ -z "$count" ]; then print_skip @@ -1310,8 +1313,6 @@ chk_rst_nr() else print_ok fi - - print_info "$extra_msg" } chk_infi_nr() @@ -1320,7 +1321,7 @@ chk_infi_nr() local infi_rx=$2 local count - print_check "infi tx" + print_check "infi tx client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtInfiniteMapTx") if [ -z "$count" ]; then print_skip @@ -1330,7 +1331,7 @@ chk_infi_nr() print_ok fi - print_check "infi rx" + print_check "infi rx server" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtInfiniteMapRx") if [ -z "$count" ]; then print_skip @@ -1506,19 +1507,21 @@ chk_add_nr() local mis_ack_nr=0 local ns_tx=$ns1 local ns_rx=$ns2 - local extra_msg="" + local tx="" + local rx="" local count local timeout if [[ $ns_invert = "invert" ]]; then ns_tx=$ns2 ns_rx=$ns1 - extra_msg="invert" + tx=" client" + rx=" server" fi timeout=$(ip netns exec ${ns_tx} sysctl -n net.mptcp.add_addr_timeout) - print_check "add addr rx" + print_check "add addr rx${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtAddAddr") if [ -z "$count" ]; then print_skip @@ -1530,7 +1533,7 @@ chk_add_nr() print_ok fi - print_check "add addr echo rx" + print_check "add addr echo rx${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtEchoAdd") if [ -z "$count" ]; then print_skip @@ -1541,7 +1544,7 @@ chk_add_nr() fi if [ $port_nr -gt 0 ]; then - print_check "add addr rx with port" + print_check "add addr rx with port${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtPortAdd") if [ -z "$count" ]; then print_skip @@ -1551,7 +1554,7 @@ chk_add_nr() print_ok fi - print_check "syn rx port" + print_check "syn rx port${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortSynRx") if [ -z "$count" ]; then print_skip @@ -1562,7 +1565,7 @@ chk_add_nr() print_ok fi - print_check "synack rx port" + print_check "synack rx port${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPJoinPortSynAckRx") if [ -z "$count" ]; then print_skip @@ -1573,7 +1576,7 @@ chk_add_nr() print_ok fi - print_check "ack rx port" + print_check "ack rx port${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortAckRx") if [ -z "$count" ]; then print_skip @@ -1584,7 +1587,7 @@ chk_add_nr() print_ok fi - print_check "syn rx port mismatch" + print_check "syn rx port mismatch${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortSynRx") if [ -z "$count" ]; then print_skip @@ -1595,7 +1598,7 @@ chk_add_nr() print_ok fi - print_check "ack rx port mismatch" + print_check "ack rx port mismatch${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortAckRx") if [ -z "$count" ]; then print_skip @@ -1606,8 +1609,6 @@ chk_add_nr() print_ok fi fi - - print_info "$extra_msg" } chk_add_tx_nr() @@ -1651,6 +1652,8 @@ chk_rm_nr() local count local addr_ns=$ns1 local subflow_ns=$ns2 + local addr="server" + local subflow="client" local extra_msg="" shift 2 @@ -1660,16 +1663,14 @@ chk_rm_nr() shift done - if [ -z $invert ]; then - addr_ns=$ns1 - subflow_ns=$ns2 - elif [ $invert = "true" ]; then + if [ "$invert" = "true" ]; then addr_ns=$ns2 subflow_ns=$ns1 - extra_msg="invert" + addr="client" + subflow="server" fi - print_check "rm addr rx" + print_check "rm addr rx ${addr}" count=$(mptcp_lib_get_counter ${addr_ns} "MPTcpExtRmAddr") if [ -z "$count" ]; then print_skip @@ -1679,7 +1680,7 @@ chk_rm_nr() print_ok fi - print_check "rm subflow" + 
print_check "rm subflow ${subflow}" count=$(mptcp_lib_get_counter ${subflow_ns} "MPTcpExtRmSubflow") if [ -z "$count" ]; then print_skip @@ -1693,7 +1694,7 @@ chk_rm_nr() count=$((count + cnt)) if [ "$count" != "$rm_subflow_nr" ]; then suffix="$count in [$rm_subflow_nr:$((rm_subflow_nr*2))]" - extra_msg+=" simult" + extra_msg="simult" fi if [ $count -ge "$rm_subflow_nr" ] && \ [ "$count" -le "$((rm_subflow_nr *2 ))" ]; then @@ -1714,7 +1715,7 @@ chk_rm_tx_nr() { local rm_addr_tx_nr=$1 - print_check "rm addr tx" + print_check "rm addr tx client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtRmAddrTx") if [ -z "$count" ]; then print_skip @@ -1733,7 +1734,7 @@ chk_prio_nr() local mpj_syn_ack=$4 local count - print_check "mp_prio tx" + print_check "mp_prio tx server" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioTx") if [ -z "$count" ]; then print_skip @@ -1743,7 +1744,7 @@ chk_prio_nr() print_ok fi - print_check "mp_prio rx" + print_check "mp_prio rx client" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioRx") if [ -z "$count" ]; then print_skip -- cgit v1.2.3 From 08eecd7e7fe79669ccab44dde518b10a2f7e48a7 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:46:00 +0200 Subject: selftests: mptcp: join: mute errors when ran in the background The test is supposed to be killed before the end, which will likely cause "Connection reset by peer" errors. It is confusing, especially because in case of real transfer errors, the test will not be marked as failed. But that's OK, there are many other tests checking that. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-9-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 321197d8977e..5d164abc18e5 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3542,8 +3542,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 2 2 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 userspace_pm_add_addr $ns1 10.0.2.1 10 @@ -3575,8 +3575,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 userspace_pm_add_sf $ns2 10.0.3.2 20 @@ -3603,8 +3603,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 chk_mptcp_info subflows 0 subflows 0 @@ -3624,8 +3624,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
wait_mpj $ns2 userspace_pm_add_sf $ns2 10.0.3.2 20 @@ -3648,8 +3648,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 1 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 userspace_pm_add_addr $ns1 10.0.2.1 10 @@ -3679,8 +3679,8 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - speed=slow \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3706,8 +3706,8 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - test_linkfail=4 speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { test_linkfail=4 speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3783,8 +3783,8 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - test_linkfail=4 speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { test_linkfail=4 speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3856,8 +3856,8 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - test_linkfail=4 speed=20 \ - run_tests $ns1 $ns2 10.0.1.1 & + { test_linkfail=4 speed=20 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_attempt_fail $ns2 -- cgit v1.2.3 From 0e2b4584d61abe8dc22db18e83c85429782793ee Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 2 Sep 2024 12:46:01 +0200 Subject: selftests: mptcp: join: simplify checksum_tests The four checksum tests are similar, only one line is different. So a for-loop can be used to simplify these tests. 
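The loop relies on deliberate word splitting: each list item carries two values, and expanding ${checksum_enable} unquoted splits it back into the two arguments reset_with_checksum() expects. A standalone sketch of the idiom (not from the patch):

        for pair in "0 0" "1 1" "0 1" "1 0"; do
                set -- $pair    # unquoted on purpose: split into $1 and $2
                echo "ns1 checksum=$1 ns2 checksum=$2"
        done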
Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-10-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 43 +++++++------------------ 1 file changed, 11 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 5d164abc18e5..43f8a9bd84c4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -363,7 +363,7 @@ reset_with_checksum() local ns1_enable=$1 local ns2_enable=$2 - reset "checksum test ${1} ${2}" || return 1 + reset "checksum test ${ns1_enable} ${ns2_enable}" || return 1 ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=$ns1_enable ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=$ns2_enable @@ -3032,37 +3032,16 @@ syncookies_tests() checksum_tests() { - # checksum test 0 0 - if reset_with_checksum 0 0; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 1 1 - if reset_with_checksum 1 1; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 0 1 - if reset_with_checksum 0 1; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 1 0 - if reset_with_checksum 1 0; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi + local checksum_enable + for checksum_enable in "0 0" "1 1" "0 1" "1 0"; do + # checksum test 0 0, 1 1, 0 1, 1 0 + if reset_with_checksum ${checksum_enable}; then + pm_nl_set_limits $ns1 0 1 + pm_nl_set_limits $ns2 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 0 0 0 + fi + done } deny_join_id0_tests() -- cgit v1.2.3 From 38dc0708bcc8d2ee9e0559a992fd6e91bf67e271 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:46:02 +0200 Subject: selftests: mptcp: pm_nl_ctl: remove re-definition 'MPTCP_PM_NAME' is defined in 'linux/mptcp_pm.h', included in 'linux/mptcp.h', no need to re-define it. 'MPTCP_PM_EVENTS' is not defined in 'linux/mptcp.h', but 'MPTCP_PM_EV_GRP_NAME' is, with the same value. We can then use the latter, and drop the other one. 
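The guard kept for IPPROTO_MPTCP illustrates when the fallback-define idiom is still worth it: only for symbols that older system headers may lack. The surviving code in pm_nl_ctl.c reads:

        #ifndef IPPROTO_MPTCP
        #define IPPROTO_MPTCP 262       /* may be absent from older libc headers */
        #endif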
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-11-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 7ad5a59adff2..994a556f46c1 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -19,12 +19,6 @@ #include "linux/mptcp.h" -#ifndef MPTCP_PM_NAME -#define MPTCP_PM_NAME "mptcp_pm" -#endif -#ifndef MPTCP_PM_EVENTS -#define MPTCP_PM_EVENTS "mptcp_pm_events" -#endif #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 #endif @@ -116,7 +110,7 @@ static int capture_events(int fd, int event_group) if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &event_group, sizeof(event_group)) < 0) - error(1, errno, "could not join the " MPTCP_PM_EVENTS " mcast group"); + error(1, errno, "could not join the " MPTCP_PM_EV_GRP_NAME " mcast group"); do { FD_ZERO(&rfds); @@ -288,7 +282,7 @@ static int genl_parse_getfamily(struct nlmsghdr *nlh, int *pm_family, if (grp->rta_type == CTRL_ATTR_MCAST_GRP_ID) *events_mcast_grp = *(__u32 *)RTA_DATA(grp); else if (grp->rta_type == CTRL_ATTR_MCAST_GRP_NAME && - !strcmp(RTA_DATA(grp), MPTCP_PM_EVENTS)) + !strcmp(RTA_DATA(grp), MPTCP_PM_EV_GRP_NAME)) got_events_grp = 1; grp = RTA_NEXT(grp, grp_len); -- cgit v1.2.3 From 577a67662ff529f617981fe9692ff277b5756402 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 30 Aug 2024 06:13:06 +0000 Subject: cxl/pci: Rename cxl_setup_parent_dport() and cxl_dport_map_regs() The name of the cxl_setup_parent_dport() function is not clear: the function initializes AER and RAS capabilities on a dport. Rename it to cxl_dport_init_ras_reporting(), which makes it easier for users to understand what the function does. Besides, adjust the order of the function parameters: the subject of cxl_dport_init_ras_reporting() is a cxl dport, so the struct cxl_dport should be the first parameter of the function. cxl_dport_map_regs() is used to map the CXL RAS capability on a cxl dport, so rename it to cxl_dport_map_ras().
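The tools/testing/cxl hunks have to follow the rename because the CXL mock environment intercepts this symbol with the linker's --wrap option. A generic, hedged sketch of the mechanism (not the CXL code; link with -Wl,--wrap=foo): references to foo() are redirected to __wrap_foo(), while __real_foo() resolves to the original definition.

        int __real_foo(int x);          /* resolves to the original foo() */

        int __wrap_foo(int x)
        {
                if (mock_active())      /* assumed helper, for illustration */
                        return 42;      /* mocked result */
                return __real_foo(x);   /* fall through to the real symbol */
        }

mock.c itself can call the plain function name instead of __real_ because it is built in a separate module that is linked without the --wrap flags.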
Signed-off-by: Li Ming Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240830061308.2327065-1-ming4.li@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 13 +++++++++---- drivers/cxl/cxl.h | 5 +++-- drivers/cxl/mem.c | 2 +- tools/testing/cxl/Kbuild | 2 +- tools/testing/cxl/test/mock.c | 6 +++--- 5 files changed, 17 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 4725e37d90fb..cca078068a05 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -787,7 +787,7 @@ static void cxl_dport_map_rch_aer(struct cxl_dport *dport) dport->regs.dport_aer = dport_aer; } -static void cxl_dport_map_regs(struct cxl_dport *dport) +static void cxl_dport_map_ras(struct cxl_dport *dport) { struct cxl_register_map *map = &dport->reg_map; struct device *dev = dport->dport_dev; @@ -831,7 +831,12 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport) } } -void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) +/** + * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport + * @dport: the cxl_dport that needs to be initialized + * @host: host device for devm operations + */ +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) { struct device *dport_dev = dport->dport_dev; @@ -843,12 +848,12 @@ void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) } dport->reg_map.host = host; - cxl_dport_map_regs(dport); + cxl_dport_map_ras(dport); if (dport->rch) cxl_disable_rch_root_ints(dport); } -EXPORT_SYMBOL_NS_GPL(cxl_setup_parent_dport, CXL); +EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, CXL); static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, struct cxl_dport *dport) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 84cf3b4d60a1..2890f7e2aad5 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -763,9 +763,10 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port, #ifdef CONFIG_PCIEAER_CXL void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport); +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); #else -static inline void cxl_setup_parent_dport(struct device *host, - struct cxl_dport *dport) { } +static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, + struct device *host) { } #endif struct cxl_decoder *to_cxl_decoder(struct device *dev); diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index ae94018a01bd..a9fd5cd5a0d2 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -166,7 +166,7 @@ static int cxl_mem_probe(struct device *dev) else endpoint_parent = &parent_port->dev; - cxl_setup_parent_dport(dev, dport); + cxl_dport_init_ras_reporting(dport, dev); scoped_guard(device, endpoint_parent) { if (!endpoint_parent->driver) { diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 3d1ca9e38b1f..b1256fee3567 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -14,7 +14,7 @@ ldflags-y += --wrap=cxl_dvsec_rr_decode ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat -ldflags-y += --wrap=cxl_setup_parent_dport +ldflags-y += --wrap=cxl_dport_init_ras_reporting DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index d619672faa49..bbd7d938156d 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ 
-299,17 +299,17 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, CXL); -void __wrap_cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) +void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) { int index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); if (!ops || !ops->is_mock_port(dport->dport_dev)) - cxl_setup_parent_dport(host, dport); + cxl_dport_init_ras_reporting(dport, host); put_cxl_mock_ops(index); } -EXPORT_SYMBOL_NS_GPL(__wrap_cxl_setup_parent_dport, CXL); +EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, CXL); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); -- cgit v1.2.3 From 2f4db2861013fcfb4be11d0dc176271ae566241c Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Mon, 19 Aug 2024 21:06:09 +0800 Subject: selftests/mm: remove unnecessary ia64 code and comment IA64 has gone with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"), so remove unnecessary ia64 special mm code and comment in selftests too. Link: https://lkml.kernel.org/r/20240819130609.3386195-1-tujinjiang@huawei.com Signed-off-by: Jinjiang Tu Cc: Kefeng Wang Cc: Mike Rapoport Cc: Nanyong Sun Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/hugepage-mmap.c | 18 +----------------- tools/testing/selftests/mm/hugepage-shm.c | 18 +----------------- tools/testing/selftests/mm/hugepage-vmemmap.c | 17 ++--------------- tools/testing/selftests/mm/map_hugetlb.c | 18 ++---------------- tools/testing/selftests/mm/run_vmtests.sh | 2 +- 6 files changed, 8 insertions(+), 67 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index cfad627e8d94..0533c5ee0b3e 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -112,7 +112,7 @@ endif endif -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) +ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c index 267eea2e0e0b..3b1b532f1cbb 100644 --- a/tools/testing/selftests/mm/hugepage-mmap.c +++ b/tools/testing/selftests/mm/hugepage-mmap.c @@ -8,13 +8,6 @@ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this * example, the app is requesting memory of size 256MB that is backed by * huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. 
*/ #define _GNU_SOURCE #include @@ -27,15 +20,6 @@ #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_SHARED | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_SHARED) -#endif - static void check_bytes(char *addr) { ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); @@ -74,7 +58,7 @@ int main(void) if (fd < 0) ksft_exit_fail_msg("memfd_create() failed: %s\n", strerror(errno)); - addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); + addr = mmap(NULL, LENGTH, PROTECTION, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) { close(fd); ksft_exit_fail_msg("mmap(): %s\n", strerror(errno)); diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c index 478bb1e989e9..ef06260802b5 100644 --- a/tools/testing/selftests/mm/hugepage-shm.c +++ b/tools/testing/selftests/mm/hugepage-shm.c @@ -8,13 +8,6 @@ * SHM_HUGETLB in the shmget system call to inform the kernel that it is * requesting huge pages. * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. - * * Note: The default shared memory limit is quite low on many kernels, * you may need to increase it via: * @@ -39,15 +32,6 @@ #define dprintf(x) printf(x) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define SHMAT_FLAGS (SHM_RND) -#else -#define ADDR (void *)(0x0UL) -#define SHMAT_FLAGS (0) -#endif - int main(void) { int shmid; @@ -61,7 +45,7 @@ int main(void) } printf("shmid: 0x%x\n", shmid); - shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); + shmaddr = shmat(shmid, NULL, 0); if (shmaddr == (char *)-1) { perror("Shared memory attach failure"); shmctl(shmid, IPC_RMID, NULL); diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c index 894d28c3dd47..df366a4d1b92 100644 --- a/tools/testing/selftests/mm/hugepage-vmemmap.c +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -22,20 +22,6 @@ #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) -/* - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. 
- */ -#ifdef __ia64__ -#define MAP_ADDR (void *)(0x8000000000000000UL) -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define MAP_ADDR NULL -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - static size_t pagesize; static size_t maplength; @@ -113,7 +99,8 @@ int main(int argc, char **argv) exit(1); } - addr = mmap(MAP_ADDR, maplength, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + addr = mmap(NULL, maplength, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); exit(1); diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c index a1f005a90a4f..b47399feab53 100644 --- a/tools/testing/selftests/mm/map_hugetlb.c +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -4,11 +4,6 @@ * system call with MAP_HUGETLB flag. Before running this program make * sure the administrator has allocated enough default sized huge pages * to cover the 256 MB allocation. - * - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. */ #include #include @@ -21,15 +16,6 @@ #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - static void check_bytes(char *addr) { ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); @@ -60,7 +46,7 @@ int main(int argc, char **argv) void *addr; size_t hugepage_size; size_t length = LENGTH; - int flags = FLAGS; + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; int shift = 0; hugepage_size = default_huge_page_size(); @@ -85,7 +71,7 @@ int main(int argc, char **argv) ksft_print_msg("Default size hugepages\n"); ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20); - addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); + addr = mmap(NULL, length, PROTECTION, flags, -1, 0); if (addr == MAP_FAILED) ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 36045edb10de..c5797ad1d37b 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -189,7 +189,7 @@ else fi # filter 64bit architectures -ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64" +ARCH64STR="arm64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64" if [ -z "$ARCH" ]; then ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') fi -- cgit v1.2.3 From 2e6d88e9d455fae9c4ac95c893362258f9540dfa Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 20 Aug 2024 17:49:17 +0800 Subject: selftests: mm: support shmem mTHP collapse testing Add shmem mTHP collapse testing. As with anonymous pages, users can use the '-s' parameter to specify the shmem mTHP size for testing.
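To make the sysfs interface concrete, here is a minimal standalone sketch (illustrative only, not part of the patch; the hardcoded order and buffer size are assumptions) of how an mTHP order maps to its per-size shmem_enabled control file, using the same snprintf pattern the patch adds to thp_settings.c:

/*
 * Illustrative sketch: derive the sysfs control file for a given shmem
 * mTHP order. On a 4K-page system, order 4 yields
 * .../hugepages-64kB/shmem_enabled.
 */
#include <stdio.h>
#include <unistd.h>

#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"

int main(void)
{
	char path[4096];
	unsigned int order = 4;	/* example value, e.g. what '-s 4' would select */

	/* Base page size in KiB, scaled by 2^order, names the directory. */
	snprintf(path, sizeof(path), THP_SYSFS "hugepages-%ukB/shmem_enabled",
		 ((unsigned int)getpagesize() >> 10) << order);
	printf("%s\n", path);
	return 0;
}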
Link: https://lkml.kernel.org/r/fa44bfa20ca5b9fd6f9163a048f3d3c1e53cd0a8.1724140601.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 4 ++- tools/testing/selftests/mm/thp_settings.c | 46 +++++++++++++++++++++++++++---- tools/testing/selftests/mm/thp_settings.h | 9 +++++- 3 files changed, 51 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 829320a519e7..56d4480e8d3c 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -1095,7 +1095,7 @@ static void usage(void) fprintf(stderr, "\n\tSupported Options:\n"); fprintf(stderr, "\t\t-h: This help message.\n"); fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); - fprintf(stderr, "\t\t Defaults to 0. Use this size for anon allocations.\n"); + fprintf(stderr, "\t\t Defaults to 0. Use this size for anon or shmem allocations.\n"); exit(1); } @@ -1209,6 +1209,8 @@ int main(int argc, char **argv) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; default_settings.hugepages[anon_order].enabled = THP_ALWAYS; + default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT; + default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS; save_settings(); thp_push_settings(&default_settings); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index a4163438108e..577eaab6266f 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -33,10 +33,11 @@ static const char * const thp_defrag_strings[] = { }; static const char * const shmem_enabled_strings[] = { + "never", "always", "within_size", "advise", - "never", + "inherit", "deny", "force", NULL @@ -200,6 +201,7 @@ void thp_write_num(const char *name, unsigned long num) void thp_read_settings(struct thp_settings *settings) { unsigned long orders = thp_supported_orders(); + unsigned long shmem_orders = thp_shmem_supported_orders(); char path[PATH_MAX]; int i; @@ -234,12 +236,24 @@ void thp_read_settings(struct thp_settings *settings) settings->hugepages[i].enabled = thp_read_string(path, thp_enabled_strings); } + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & shmem_orders)) { + settings->shmem_hugepages[i].enabled = SHMEM_NEVER; + continue; + } + snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled", + (getpagesize() >> 10) << i); + settings->shmem_hugepages[i].enabled = + thp_read_string(path, shmem_enabled_strings); + } } void thp_write_settings(struct thp_settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; unsigned long orders = thp_supported_orders(); + unsigned long shmem_orders = thp_shmem_supported_orders(); char path[PATH_MAX]; int enabled; int i; @@ -271,6 +285,15 @@ void thp_write_settings(struct thp_settings *settings) enabled = settings->hugepages[i].enabled; thp_write_string(path, thp_enabled_strings[enabled]); } + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & shmem_orders)) + continue; + snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled", + (getpagesize() >> 10) << i); + enabled = settings->shmem_hugepages[i].enabled; + thp_write_string(path, 
shmem_enabled_strings[enabled]); + } } struct thp_settings *thp_current_settings(void) @@ -324,17 +347,18 @@ void thp_set_read_ahead_path(char *path) dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0'; } -unsigned long thp_supported_orders(void) +static unsigned long __thp_supported_orders(bool is_shmem) { unsigned long orders = 0; char path[PATH_MAX]; char buf[256]; - int ret; - int i; + int ret, i; + char anon_dir[] = "enabled"; + char shmem_dir[] = "shmem_enabled"; for (i = 0; i < NR_ORDERS; i++) { - ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/enabled", - (getpagesize() >> 10) << i); + ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/%s", + (getpagesize() >> 10) << i, is_shmem ? shmem_dir : anon_dir); if (ret >= PATH_MAX) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); @@ -347,3 +371,13 @@ unsigned long thp_supported_orders(void) return orders; } + +unsigned long thp_supported_orders(void) +{ + return __thp_supported_orders(false); +} + +unsigned long thp_shmem_supported_orders(void) +{ + return __thp_supported_orders(true); +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index 71cbff05f4c7..876235a23460 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -22,10 +22,11 @@ enum thp_defrag { }; enum shmem_enabled { + SHMEM_NEVER, SHMEM_ALWAYS, SHMEM_WITHIN_SIZE, SHMEM_ADVISE, - SHMEM_NEVER, + SHMEM_INHERIT, SHMEM_DENY, SHMEM_FORCE, }; @@ -46,6 +47,10 @@ struct khugepaged_settings { unsigned long pages_to_scan; }; +struct shmem_hugepages_settings { + enum shmem_enabled enabled; +}; + struct thp_settings { enum thp_enabled thp_enabled; enum thp_defrag thp_defrag; @@ -54,6 +59,7 @@ struct thp_settings { struct khugepaged_settings khugepaged; unsigned long read_ahead_kb; struct hugepages_settings hugepages[NR_ORDERS]; + struct shmem_hugepages_settings shmem_hugepages[NR_ORDERS]; }; int read_file(const char *path, char *buf, size_t buflen); @@ -76,5 +82,6 @@ void thp_save_settings(void); void thp_set_read_ahead_path(char *path); unsigned long thp_supported_orders(void); +unsigned long thp_shmem_supported_orders(void); #endif /* __THP_SETTINGS_H__ */ -- cgit v1.2.3 From f28bdd1b17ec187eaa34845814afaaff99832762 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Sat, 17 Aug 2024 01:18:34 +0100 Subject: selftests/mm: add more mseal traversal tests Add more mseal traversal tests across VMAs, where we could possibly screw up sealing checks. These exercise cross-VMA traversal for mprotect, munmap and madvise. In particular, we test the case where a regular VMA is followed by a sealed VMA. [akpm@linux-foundation.org: remove incorrect comment, per review] [akpm@linux-foundation.org: remove the correct comment, per Pedro] [pedro.falcato@gmail.com: fix mseal's length] Link: https://lkml.kernel.org/r/vc4czyuemmu3kylqb4ctaga6y5yvondlyabimx6jvljlw2fkea@djawlllf45xa Link: https://lkml.kernel.org/r/20240817-mseal-depessimize-v3-7-d8d2e037df30@gmail.com Signed-off-by: Pedro Falcato Reviewed-by: Liam R. 
Howlett Reviewed-by: Lorenzo Stoakes Cc: Jeff Xu Cc: Kees Cook Cc: Linus Torvalds Cc: Michael Ellerman Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 106 +++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index bd0bfda7aae7..e7ed720e9677 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -777,6 +777,42 @@ static void test_seal_mprotect_partial_mprotect(bool seal) REPORT_TEST_PASS(); } +static void test_seal_mprotect_partial_mprotect_tail(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + int prot; + + /* + * Check if a partial mseal (that results in two vmas) works correctly. + * It might mprotect the first, but it'll never touch the second (msealed) vma. + */ + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_mprotect(ptr, size, PROT_EXEC); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + FAIL_TEST_IF_FALSE(get_vma_size(ptr + page_size, &prot) > 0); + FAIL_TEST_IF_FALSE(prot == 0x4); + } + + REPORT_TEST_PASS(); +} + + static void test_seal_mprotect_two_vma_with_gap(bool seal) { void *ptr; @@ -994,6 +1030,36 @@ static void test_seal_munmap_vma_with_gap(bool seal) REPORT_TEST_PASS(); } +static void test_seal_munmap_partial_across_vmas(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + FAIL_TEST_IF_FALSE(get_vma_size(ptr + page_size, &prot) > 0); + FAIL_TEST_IF_FALSE(prot == 0x4); + } + + REPORT_TEST_PASS(); +} + static void test_munmap_start_freed(bool seal) { void *ptr; @@ -1746,6 +1812,37 @@ static void test_seal_discard_ro_anon(bool seal) REPORT_TEST_PASS(); } +static void test_seal_discard_across_vmas(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + REPORT_TEST_PASS(); +} + + static void test_seal_madvise_nodiscard(bool seal) { void *ptr; @@ -1790,7 +1887,7 @@ int main(int argc, char **argv) if (!pkey_supported()) ksft_print_msg("PKEY not supported\n"); - ksft_set_plan(82); + ksft_set_plan(88); test_seal_addseal(); test_seal_unmapped_start(); @@ -1836,12 +1933,17 @@ int main(int argc, char **argv) test_seal_mprotect_split(false); test_seal_mprotect_split(true); + test_seal_mprotect_partial_mprotect_tail(false); + test_seal_mprotect_partial_mprotect_tail(true); + test_seal_munmap(false); test_seal_munmap(true); 
test_seal_munmap_two_vma(false); test_seal_munmap_two_vma(true); test_seal_munmap_vma_with_gap(false); test_seal_munmap_vma_with_gap(true); + test_seal_munmap_partial_across_vmas(false); + test_seal_munmap_partial_across_vmas(true); test_munmap_start_freed(false); test_munmap_start_freed(true); @@ -1873,6 +1975,8 @@ int main(int argc, char **argv) test_seal_madvise_nodiscard(true); test_seal_discard_ro_anon(false); test_seal_discard_ro_anon(true); + test_seal_discard_across_vmas(false); + test_seal_discard_across_vmas(true); test_seal_discard_ro_anon_on_rw(false); test_seal_discard_ro_anon_on_rw(true); test_seal_discard_ro_anon_on_shared(false); -- cgit v1.2.3 From 02e1960aafac33721401dcd92e915325fdb524b2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:42 +0100 Subject: mm: rename PG_mappedtodisk to PG_owner_2 This flag has similar constraints to PG_owner_priv_1 -- it is ignored by core code, and is entirely for the use of the code which allocated the folio. Since the pagecache does not use it, individual filesystems can use it. The bufferhead code does use it, so filesystems which use the buffer cache must not use it for another purpose. Link: https://lkml.kernel.org/r/20240821193445.2294269-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/proc/page.c | 2 +- include/linux/kernel-page-flags.h | 2 +- include/linux/page-flags.h | 24 ++++++++++++++++-------- include/trace/events/mmflags.h | 2 +- tools/mm/page-types.c | 10 +++++----- 5 files changed, 24 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/fs/proc/page.c b/fs/proc/page.c index 73a0f872d97f..e74e639893be 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -211,7 +211,7 @@ u64 stable_page_flags(const struct page *page) #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); - u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2); u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index 859f4b0c1b2b..7c587a711be1 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -10,7 +10,7 @@ */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 +#define KPF_OWNER_2 34 #define KPF_PRIVATE 35 #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 017205048cab..1ff3d172c22c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -101,12 +101,12 @@ enum pageflags { PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, - PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ + PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ + PG_owner_2, /* Owner use. 
If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ - PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ @@ -131,6 +131,11 @@ enum pageflags { PG_readahead = PG_reclaim, + /* Anonymous memory (and shmem) */ + PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* Some filesystems */ + PG_checked = PG_owner_priv_1, + /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped @@ -138,13 +143,13 @@ enum pageflags { * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ - PG_anon_exclusive = PG_mappedtodisk, - - /* Filesystems */ - PG_checked = PG_owner_priv_1, + PG_anon_exclusive = PG_owner_2, - /* SwapBacked */ - PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* + * Set if all buffer heads in the folio are mapped. + * Filesystems which do not use BHs can use it for their own purpose. + */ + PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes @@ -540,6 +545,9 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) +/* owner_2 can be set on tail pages for anon memory */ +FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) + /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index c151cc21d367..3b51558cdc9b 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -107,13 +107,13 @@ DEF_PAGEFLAG_NAME(active), \ DEF_PAGEFLAG_NAME(workingset), \ DEF_PAGEFLAG_NAME(owner_priv_1), \ + DEF_PAGEFLAG_NAME(owner_2), \ DEF_PAGEFLAG_NAME(arch_1), \ DEF_PAGEFLAG_NAME(reserved), \ DEF_PAGEFLAG_NAME(private), \ DEF_PAGEFLAG_NAME(private_2), \ DEF_PAGEFLAG_NAME(writeback), \ DEF_PAGEFLAG_NAME(head), \ - DEF_PAGEFLAG_NAME(mappedtodisk), \ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ DEF_PAGEFLAG_NAME(unevictable) \ diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 8d5595b6c59f..8ca41c41105e 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -71,7 +71,7 @@ /* [32-] kernel hacking assistances */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 +#define KPF_OWNER_2 34 #define KPF_PRIVATE 35 #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 @@ -129,7 +129,7 @@ static const char * const page_flag_names[] = { [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", - [KPF_MAPPEDTODISK] = "d:mappedtodisk", + [KPF_OWNER_2] = "d:owner_2", [KPF_PRIVATE] = "P:private", [KPF_PRIVATE_2] = "p:private_2", [KPF_OWNER_PRIVATE] = "O:owner_private", @@ -472,9 +472,9 @@ static int bit_mask_ok(uint64_t flags) static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) { - /* Anonymous pages overload PG_mappedtodisk */ - if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK))) - flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE); + /* Anonymous pages use PG_owner_2 for anon_exclusive */ + if ((flags & BIT(ANON)) && (flags & BIT(OWNER_2))) + flags ^= BIT(OWNER_2) | 
BIT(ANON_EXCLUSIVE); /* SLUB overloads several page flags */ if (flags & BIT(SLAB)) { -- cgit v1.2.3 From 7a87225ae2c6c317c7b80cf599e5cf0eee699196 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:43 +0100 Subject: x86: remove PG_uncached Convert x86 to use PG_arch_2 instead of PG_uncached and remove PG_uncached. Link: https://lkml.kernel.org/r/20240821193445.2294269-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- .../features/vm/PG_uncached/arch-support.txt | 30 ---------------------- arch/arm64/Kconfig | 3 ++- arch/x86/Kconfig | 5 +--- arch/x86/mm/pat/memtype.c | 8 +++--- fs/proc/page.c | 8 +++--- include/linux/kernel-page-flags.h | 1 - include/linux/page-flags.h | 13 +++------- include/trace/events/mmflags.h | 23 ++++++++--------- mm/Kconfig | 9 +++---- mm/huge_memory.c | 4 ++- tools/mm/page-types.c | 3 +-- 11 files changed, 31 insertions(+), 76 deletions(-) delete mode 100644 Documentation/features/vm/PG_uncached/arch-support.txt (limited to 'tools') diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt deleted file mode 100644 index 5a7508b8c967..000000000000 --- a/Documentation/features/vm/PG_uncached/arch-support.txt +++ /dev/null @@ -1,30 +0,0 @@ -# -# Feature name: PG_uncached -# Kconfig: ARCH_USES_PG_UNCACHED -# description: arch supports the PG_uncached page flag -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | TODO | - | arc: | TODO | - | arm: | TODO | - | arm64: | TODO | - | csky: | TODO | - | hexagon: | TODO | - | loongarch: | TODO | - | m68k: | TODO | - | microblaze: | TODO | - | mips: | TODO | - | nios2: | TODO | - | openrisc: | TODO | - | parisc: | TODO | - | powerpc: | TODO | - | riscv: | TODO | - | s390: | TODO | - | sh: | TODO | - | sparc: | TODO | - | um: | TODO | - | x86: | ok | - | xtensa: | TODO | - ----------------------- diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2f8ff354ca6..6494848019a0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2100,7 +2100,8 @@ config ARM64_MTE depends on ARM64_PAN select ARCH_HAS_SUBPAGE_FAULTS select ARCH_USES_HIGH_VMA_FLAGS - select ARCH_USES_PG_ARCH_X + select ARCH_USES_PG_ARCH_2 + select ARCH_USES_PG_ARCH_3 help Memory Tagging (part of the ARMv8.5 Extensions) provides architectural support for run-time, always-on detection of diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index acd9745bf2ae..b74b9ee484da 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1799,6 +1799,7 @@ config X86_PAT def_bool y prompt "x86 PAT support" if EXPERT depends on MTRR + select ARCH_USES_PG_ARCH_2 help Use PAT attributes to setup page level cache control. @@ -1810,10 +1811,6 @@ config X86_PAT If unsure, say Y. -config ARCH_USES_PG_UNCACHED - def_bool y - depends on X86_PAT - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index bdc2a240c2aa..1fa0bf6ed295 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -104,7 +104,7 @@ __setup("debugpat", pat_debug_setup); #ifdef CONFIG_X86_PAT /* - * X86 PAT uses page flags arch_1 and uncached together to keep track of + * X86 PAT uses page flags arch_1 and arch_2 together to keep track of * memory type of pages that have backing page struct. 
* * X86 PAT supports 4 different memory types: @@ -118,9 +118,9 @@ __setup("debugpat", pat_debug_setup); #define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) -#define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_arch_2) +#define _PGMT_WT (1UL << PG_arch_2 | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_arch_2 | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) static inline enum page_cache_mode get_page_memtype(struct page *pg) diff --git a/fs/proc/page.c b/fs/proc/page.c index e74e639893be..a55f5acefa97 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -206,18 +206,16 @@ u64 stable_page_flags(const struct page *page) u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); -#endif - u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2); u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index 7c587a711be1..196778a087c4 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -15,7 +15,6 @@ #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 #define KPF_ARCH 38 -#define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 #define KPF_ARCH_3 42 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1ff3d172c22c..2175ebceb41c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -113,9 +113,6 @@ enum pageflags { #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - PG_uncached, /* Page has been mapped as uncached */ -#endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. 
Don't touch */ #endif @@ -123,8 +120,10 @@ enum pageflags { PG_young, PG_idle, #endif -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, @@ -602,12 +601,6 @@ FOLIO_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) -#else -PAGEFLAG_FALSE(Uncached, uncached) -#endif - #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 3b51558cdc9b..58f2699331b6 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -71,12 +71,6 @@ #define IF_HAVE_PG_MLOCK(_name) #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED -#define IF_HAVE_PG_UNCACHED(_name) ,{1UL << PG_##_name, __stringify(_name)} -#else -#define IF_HAVE_PG_UNCACHED(_name) -#endif - #ifdef CONFIG_MEMORY_FAILURE #define IF_HAVE_PG_HWPOISON(_name) ,{1UL << PG_##_name, __stringify(_name)} #else @@ -89,10 +83,16 @@ #define IF_HAVE_PG_IDLE(_name) #endif -#ifdef CONFIG_ARCH_USES_PG_ARCH_X -#define IF_HAVE_PG_ARCH_X(_name) ,{1UL << PG_##_name, __stringify(_name)} +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 +#define IF_HAVE_PG_ARCH_2(_name) ,{1UL << PG_##_name, __stringify(_name)} +#else +#define IF_HAVE_PG_ARCH_2(_name) +#endif + +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 +#define IF_HAVE_PG_ARCH_3(_name) ,{1UL << PG_##_name, __stringify(_name)} #else -#define IF_HAVE_PG_ARCH_X(_name) +#define IF_HAVE_PG_ARCH_3(_name) #endif #define DEF_PAGEFLAG_NAME(_name) { 1UL << PG_##_name, __stringify(_name) } @@ -118,12 +118,11 @@ DEF_PAGEFLAG_NAME(swapbacked), \ DEF_PAGEFLAG_NAME(unevictable) \ IF_HAVE_PG_MLOCK(mlocked) \ -IF_HAVE_PG_UNCACHED(uncached) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ IF_HAVE_PG_IDLE(young) \ -IF_HAVE_PG_ARCH_X(arch_2) \ -IF_HAVE_PG_ARCH_X(arch_3) +IF_HAVE_PG_ARCH_2(arch_2) \ +IF_HAVE_PG_ARCH_3(arch_3) #define show_page_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/mm/Kconfig b/mm/Kconfig index 5946dcdcaeda..8078a4b3c509 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1079,13 +1079,10 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool -config ARCH_USES_PG_ARCH_X +config ARCH_USES_PG_ARCH_2 + bool +config ARCH_USES_PG_ARCH_3 bool - help - Enable the definition of PG_arch_x page flags with x > 1. Only - suitable for 64-bit architectures with CONFIG_FLATMEM or - CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be - enough room for additional bits in page->flags. 
config VM_EVENT_COUNTERS default y diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8dfb912ebdaa..c4b45ad576f6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2993,8 +2993,10 @@ static void __split_huge_page_tail(struct folio *folio, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 (1L << PG_arch_2) | +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 (1L << PG_arch_3) | #endif (1L << PG_dirty) | diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 8ca41c41105e..fa050d5a48cd 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -76,7 +76,7 @@ #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 #define KPF_ARCH 38 -#define KPF_UNCACHED 39 +#define KPF_UNCACHED 39 /* unused */ #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 @@ -134,7 +134,6 @@ static const char * const page_flag_names[] = { [KPF_PRIVATE_2] = "p:private_2", [KPF_OWNER_PRIVATE] = "O:owner_private", [KPF_ARCH] = "h:arch", - [KPF_UNCACHED] = "c:uncached", [KPF_SOFTDIRTY] = "f:softdirty", [KPF_ARCH_2] = "H:arch_2", -- cgit v1.2.3 From c41a701d18efe6b8aa402efab16edbaba50c9548 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 21 Aug 2024 14:31:15 +0200 Subject: selftests/mm: fix charge_reserved_hugetlb.sh test Currently, running the charge_reserved_hugetlb.sh selftest we can sometimes observe something like: $ ./charge_reserved_hugetlb.sh -cgroup-v2 ... write_result is 0 After write: hugetlb_usage=0 reserved_usage=10485760 killing write_to_hugetlbfs Received 2. Deleting the memory Detach failure: Invalid argument umount: /mnt/huge: target is busy. Both cases are issues in the test. While the unmount error seems to be racy, it will make the test fail: $ ./run_vmtests.sh -t hugetlb ... # [FAIL] not ok 10 charge_reserved_hugetlb.sh -cgroup-v2 # exit=32 The issue is that we are not waiting for the write_to_hugetlbfs process to quit. So it might still have a hugetlbfs file open, about which umount is not happy. Fix that by making "killall" wait for the process to quit. The other error ("Detach failure: Invalid argument") does not seem to result in a test error, but is misleading. Turns out write_to_hugetlbfs.c unconditionally tries to cleanup using shmdt(), even when we only mmap()'ed a hugetlb file. Even worse, shmaddr is never even set for the SHM case. Fix that as well. With this change it seems to work as expected. 
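The cleanup invariant the fix establishes can be reduced to a short sketch (illustrative only; cleanup_shm() is a made-up helper name, the real logic lives in sig_handler() in the diff below): a SysV segment is detached and removed only if one was actually attached, so the mmap()-only mode never issues a bogus shmdt().

/*
 * Illustrative reduction of the fixed cleanup logic: shmaddr stays NULL
 * unless the SHM mode attached a segment, so the mmap()-only mode has
 * nothing to detach and must not call shmdt() at all.
 */
#include <stdio.h>
#include <sys/shm.h>

static int *shmaddr;	/* set only after a successful shmat() */
static int shmid;

static void cleanup_shm(void)	/* hypothetical helper name */
{
	if (!shmaddr)
		return;
	if (shmdt(shmaddr) != 0)
		perror("shmdt");
	shmctl(shmid, IPC_RMID, NULL);
	shmaddr = NULL;
}

int main(void)
{
	cleanup_shm();	/* harmless no-op in the mmap()-only case */
	return 0;
}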
Link: https://lkml.kernel.org/r/20240821123115.2068812-1-david@redhat.com Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests") Signed-off-by: David Hildenbrand Reported-by: Mario Casquero Reviewed-by: Mina Almasry Tested-by: Mario Casquero Cc: Shuah Khan Cc: Muchun Song Signed-off-by: Andrew Morton --- .../testing/selftests/mm/charge_reserved_hugetlb.sh | 2 +- tools/testing/selftests/mm/write_to_hugetlbfs.c | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index d680c00d2853..67df7b47087f 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -254,7 +254,7 @@ function cleanup_hugetlb_memory() { local cgroup="$1" if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then echo killing write_to_hugetlbfs - killall -2 write_to_hugetlbfs + killall -2 --wait write_to_hugetlbfs wait_for_hugetlb_memory_to_get_depleted $cgroup fi set -e diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 6a2caba19ee1..1289d311efd7 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -28,7 +28,7 @@ enum method { /* Global variables. */ static const char *self; -static char *shmaddr; +static int *shmaddr; static int shmid; /* @@ -47,15 +47,17 @@ void sig_handler(int signo) { printf("Received %d.\n", signo); if (signo == SIGINT) { - printf("Deleting the memory\n"); - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); + if (shmaddr) { + printf("Deleting the memory\n"); + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + shmctl(shmid, IPC_RMID, NULL); - exit(4); + printf("Done deleting the memory\n"); } - - shmctl(shmid, IPC_RMID, NULL); - printf("Done deleting the memory\n"); } exit(2); } @@ -211,7 +213,8 @@ int main(int argc, char **argv) shmctl(shmid, IPC_RMID, NULL); exit(2); } - printf("shmaddr: %p\n", ptr); + shmaddr = ptr; + printf("shmaddr: %p\n", shmaddr); break; default: -- cgit v1.2.3 From fd06ce2ce4c1f93e3d5d7d2640672ccb1a94cce3 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Fri, 23 Aug 2024 16:27:09 +0000 Subject: selftests: test_zswap: add test for hierarchical zswap.writeback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure that the zswap.writeback check goes up the cgroup tree, i.e. is hierarchical. Create a subcgroup which has zswap.writeback set to 1, and verify that the upper hierarchy's restrictions still apply.
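The semantics being verified amount to a logical AND walked up the cgroup tree. A toy model of this (not kernel code; the struct shape is an assumption for illustration):

/*
 * Toy model of hierarchical zswap.writeback: writeback is effectively
 * permitted only if every cgroup from here up to the root allows it.
 */
#include <stdbool.h>
#include <stddef.h>

struct cg {
	struct cg *parent;	/* NULL at the root */
	bool zswap_writeback;
};

static bool zswap_writeback_allowed(const struct cg *cg)
{
	for (; cg; cg = cg->parent)
		if (!cg->zswap_writeback)
			return false;
	return true;
}

int main(void)
{
	struct cg parent = { .parent = NULL, .zswap_writeback = false };
	struct cg child = { .parent = &parent, .zswap_writeback = true };

	/* Returns 0: the child is correctly denied despite its own knob. */
	return zswap_writeback_allowed(&child) ? 1 : 0;
}

With the parent's writeback disabled, the walk still returns false for the child even though the child enables it, which is exactly the behaviour the new subtest asserts.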
Link: https://lkml.kernel.org/r/20240823162506.12117-2-me@yhndnzj.com Signed-off-by: Mike Yuan Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 75 +++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 190096017f80..40de679248b8 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -263,15 +263,13 @@ out: static int attempt_writeback(const char *cgroup, void *arg) { long pagesize = sysconf(_SC_PAGESIZE); - char *test_group = arg; size_t memsize = MB(4); char buf[pagesize]; long zswap_usage; - bool wb_enabled; + bool wb_enabled = *(bool *) arg; int ret = -1; char *mem; - wb_enabled = cg_read_long(test_group, "memory.zswap.writeback"); mem = (char *)malloc(memsize); if (!mem) return ret; @@ -288,12 +286,12 @@ static int attempt_writeback(const char *cgroup, void *arg) memcpy(&mem[i], buf, pagesize); /* Try and reclaim allocated memory */ - if (cg_write_numeric(test_group, "memory.reclaim", memsize)) { + if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) { ksft_print_msg("Failed to reclaim all of the requested memory\n"); goto out; } - zswap_usage = cg_read_long(test_group, "memory.zswap.current"); + zswap_usage = cg_read_long(cgroup, "memory.zswap.current"); /* zswpin */ for (int i = 0; i < memsize; i += pagesize) { @@ -303,7 +301,7 @@ static int attempt_writeback(const char *cgroup, void *arg) } } - if (cg_write_numeric(test_group, "memory.zswap.max", zswap_usage/2)) + if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2)) goto out; /* @@ -312,7 +310,7 @@ static int attempt_writeback(const char *cgroup, void *arg) * If writeback is disabled, memory reclaim will fail as zswap is limited and * it can't writeback to swap. */ - ret = cg_write_numeric(test_group, "memory.reclaim", memsize); + ret = cg_write_numeric(cgroup, "memory.reclaim", memsize); if (!wb_enabled) ret = (ret == -EAGAIN) ? 0 : -1; @@ -321,12 +319,41 @@ out: return ret; } +static int test_zswap_writeback_one(const char *cgroup, bool wb) +{ + long zswpwb_before, zswpwb_after; + + zswpwb_before = get_cg_wb_count(cgroup); + if (zswpwb_before != 0) { + ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + return -1; + } + + if (cg_run(cgroup, attempt_writeback, (void *) &wb)) + return -1; + + /* Verify that zswap writeback occurred only if writeback was enabled */ + zswpwb_after = get_cg_wb_count(cgroup); + if (zswpwb_after < 0) + return -1; + + if (wb != !!zswpwb_after) { + ksft_print_msg("zswpwb_after is %ld while wb is %s", + zswpwb_after, wb ? "enabled" : "disabled"); + return -1; + } + + return 0; +} + /* Test to verify the zswap writeback path */ static int test_zswap_writeback(const char *root, bool wb) { - long zswpwb_before, zswpwb_after; int ret = KSFT_FAIL; - char *test_group; + char *test_group, *test_group_child = NULL; + + if (cg_read_strcmp(root, "memory.zswap.writeback", "1")) + return KSFT_SKIP; test_group = cg_name(root, "zswap_writeback_test"); if (!test_group) @@ -336,29 +363,35 @@ static int test_zswap_writeback(const char *root, bool wb) if (cg_write(test_group, "memory.zswap.writeback", wb ? 
"1" : "0")) goto out; - zswpwb_before = get_cg_wb_count(test_group); - if (zswpwb_before != 0) { - ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + if (test_zswap_writeback_one(test_group, wb)) goto out; - } - if (cg_run(test_group, attempt_writeback, (void *) test_group)) + /* Reset memory.zswap.max to max (modified by attempt_writeback), and + * set up child cgroup, whose memory.zswap.writeback is hardcoded to 1. + * Thus, the parent's setting shall be what's in effect. */ + if (cg_write(test_group, "memory.zswap.max", "max")) + goto out; + if (cg_write(test_group, "cgroup.subtree_control", "+memory")) goto out; - /* Verify that zswap writeback occurred only if writeback was enabled */ - zswpwb_after = get_cg_wb_count(test_group); - if (zswpwb_after < 0) + test_group_child = cg_name(test_group, "zswap_writeback_test_child"); + if (!test_group_child) + goto out; + if (cg_create(test_group_child)) + goto out; + if (cg_write(test_group_child, "memory.zswap.writeback", "1")) goto out; - if (wb != !!zswpwb_after) { - ksft_print_msg("zswpwb_after is %ld while wb is %s", - zswpwb_after, wb ? "enabled" : "disabled"); + if (test_zswap_writeback_one(test_group_child, wb)) goto out; - } ret = KSFT_PASS; out: + if (test_group_child) { + cg_destroy(test_group_child); + free(test_group_child); + } cg_destroy(test_group); free(test_group); return ret; -- cgit v1.2.3 From 4e52a60ac5c079000add146ca39b0edcc30f74cd Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:13 +0100 Subject: tools: improve vma test Makefile Patch series "mm: remove vma_merge()", v3. The infamous vma_merge() function has been the cause of a great deal of pain, bugs and confusion for a very long time. It is subtle, contains many corner cases, tries to do far too much and is as a result very fragile. The fact that the function requires there to be a numbering system to cover each possible eventuality with references to each in the many branches of its implementation as to which case you are looking at speaks to all this. Some of this complexity is inherent - unfortunately there is no getting away from the need to figure out precisely how to execute the merge, whether we need to remove VMAs, whether it is safe to do so, what constitutes a mergeable VMA and so on. However, a lot of the complexity is not inherent but instead a product of the function's 'organic' development. Liam has gone to great lengths to improve the situation as a part of his maple tree implementation, greatly improving the readability of the code, and Vlastimil and myself have additionally gone to lengths to try to improve things further. However, with the availability of userland VMA testing, it now becomes possible to perform a rather more significant refactoring while maintaining confidence in its correct operation. An attempt was previously made by Vlastimil [0] to eliminate vma_merge(), however it was rather - brutal - and an astute reader might refer to the date of that patch for insight as to its intent. This series instead divides merge operations into two natural kinds - merges which occur when a NEW vma is being added to the address space, and merges which occur when a vma is being MODIFIED. Happily, the vma_expand() function introduced by Liam, which has the capacity for also deleting a subsequent VMA, covers each of the NEW vma cases. 
By abstracting the actual final commit of changes to a VMA to its own function, commit_merge() and writing a wrapper around vma_expand() for new VMA cases vma_merge_new_range(), we can avoid having to use vma_merge() for these instances altogether. By doing so we are also able to then de-duplicate all existing merge logic in mmap_region() and do_brk_flags() and have everything invoke this new function, so we universally take the same approach to merging new VMAs. Having done so, we can then completely rework vma_merge() into vma_merge_existing_range() and use this for the instances where a merge is proposed for a region of an existing VMA. This eliminates vma_merge() and its numbered cases and instead divides things into logical cases - merge both, merge left, merge right (the latter 2 being either partial or full merges). The code is heavily annotated with ASCII diagrams and greatly simplified in comparison to the existing vma_merge() function. Having made this change, we take the opportunity to address an issue with merging VMAs possessing a vm_ops->close() hook - commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test") and commit fc0c8f9089c2 ("mm, mmap: fix vma_merge() case 7 with vma_ops->close") make efforts to relax how we handle these, making assumptions about which VMAs might end up deleted (and thus, if possessing a vm_ops->close() hook, cannot be). This refactor means we do not need to guess, so instead explicitly only disallow merge in instances where a VMA with a vm_ops->close() hook would be deleted (and try a smaller merge in cases where this is possible). In addition to these changes, we introduce a new vma_merge_struct abstraction to allow VMA merge state to be threaded through the operation neatly. There is heavy unit testing provided for all merge functionality, added prior to the refactoring, allowing for before/after testing. The vm_ops->close() change also introduces exhaustive testing to demonstrate that this functions as expected, and in addition to this the reproduction code from commit fc0c8f9089c2 ("mm, mmap: fix vma_merge() case 7 with vma_ops->close") was tested and confirmed passing. [0]:https://lore.kernel.org/linux-mm/20240401192623.18575-2-vbabka@suse.cz/ This patch (of 10): Have vma.o depend on its source dependencies explicitly, as previously these were simply being ignored as existing object files were up to date. This now correctly re-triggers the build if mm/ source is changed as well as local source code. Also set clean as a phony rule. Link: https://lkml.kernel.org/r/cover.1725040657.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/e3ea58f08364ae5432c9a074de0195a7c7e0b04a.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. 
McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/vma/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile index bfc905d222cf..860fd2311dcc 100644 --- a/tools/testing/vma/Makefile +++ b/tools/testing/vma/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later -.PHONY: default +.PHONY: default clean default: vma @@ -9,7 +9,9 @@ include ../shared/shared.mk OFILES = $(SHARED_OFILES) vma.o maple-shim.o TARGETS = vma -vma: $(OFILES) vma_internal.h ../../../mm/vma.c ../../../mm/vma.h +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h + +vma: $(OFILES) $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) clean: -- cgit v1.2.3 From 955db39676b6de84283b370d03683171b67dceb3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:14 +0100 Subject: tools: add VMA merge tests Add a variety of VMA merge unit tests to assert that the behaviour of VMA merge is correct at an abstract level and VMAs are merged or not merged as expected. These are intentionally added _before_ we start refactoring vma_merge() in order that we can continually assert correctness throughout the rest of the series. In order to reduce churn going forward, we backport the vma_merge_struct data type to the test code which we introduce and use in a future commit, and add wrappers around the merge new and existing VMA cases. Link: https://lkml.kernel.org/r/1c7a0b43cfad2c511a6b1b52f3507696478ff51a.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/vma/vma.c | 1282 +++++++++++++++++++++++++++++++++++++- tools/testing/vma/vma_internal.h | 45 +- 2 files changed, 1317 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 48e033c60d87..71bd30d5da81 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -7,13 +7,43 @@ #include "maple-shared.h" #include "vma_internal.h" +/* Include so header guard set. */ +#include "../../../mm/vma.h" + +static bool fail_prealloc; + +/* Then override vma_iter_prealloc() so we can choose to fail it. */ +#define vma_iter_prealloc(vmi, vma) \ + (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) + /* * Directly import the VMA implementation here. Our vma_internal.h wrapper * provides userland-equivalent functionality for everything vma.c uses. */ #include "../../../mm/vma.c" +/* + * Temporarily forward-ported from a future in which vmg's are used for merging. + */ +struct vma_merge_struct { + struct mm_struct *mm; + struct vma_iterator *vmi; + pgoff_t pgoff; + struct vm_area_struct *prev; + struct vm_area_struct *next; /* Modified by vma_merge(). */ + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. 
*/ + unsigned long start; + unsigned long end; + unsigned long flags; + struct file *file; + struct anon_vma *anon_vma; + struct mempolicy *policy; + struct vm_userfaultfd_ctx uffd_ctx; + struct anon_vma_name *anon_name; +}; + const struct vm_operations_struct vma_dummy_vm_ops; +static struct anon_vma dummy_anon_vma; #define ASSERT_TRUE(_expr) \ do { \ @@ -28,6 +58,14 @@ const struct vm_operations_struct vma_dummy_vm_ops; #define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) #define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) +static struct task_struct __current; + +struct task_struct *get_current(void) +{ + return &__current; +} + +/* Helper function to simply allocate a VMA. */ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, unsigned long start, unsigned long end, @@ -47,22 +85,201 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, return ret; } +/* Helper function to allocate a VMA and link it to the tree. */ +static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, + unsigned long start, + unsigned long end, + pgoff_t pgoff, + vm_flags_t flags) +{ + struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, flags); + + if (vma == NULL) + return NULL; + + if (vma_link(mm, vma)) { + vm_area_free(vma); + return NULL; + } + + /* + * Reset this counter which we use to track whether writes have + * begun. Linking to the tree will have caused this to be incremented, + * which means we will get a false positive otherwise. + */ + vma->vm_lock_seq = -1; + + return vma; +} + +/* Helper function which provides a wrapper around a merge new VMA operation. */ +static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) +{ + /* vma_merge() needs a VMA to determine mm, anon_vma, and file. */ + struct vm_area_struct dummy = { + .vm_mm = vmg->mm, + .vm_flags = vmg->flags, + .anon_vma = vmg->anon_vma, + .vm_file = vmg->file, + }; + + /* + * For convenience, get prev and next VMAs. Which the new VMA operation + * requires. + */ + vmg->next = vma_next(vmg->vmi); + vmg->prev = vma_prev(vmg->vmi); + + vma_iter_set(vmg->vmi, vmg->start); + return vma_merge_new_vma(vmg->vmi, vmg->prev, &dummy, vmg->start, + vmg->end, vmg->pgoff); +} + +/* + * Helper function which provides a wrapper around a merge existing VMA + * operation. + */ +static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) +{ + /* vma_merge() needs a VMA to determine mm, anon_vma, and file. */ + struct vm_area_struct dummy = { + .vm_mm = vmg->mm, + .vm_flags = vmg->flags, + .anon_vma = vmg->anon_vma, + .vm_file = vmg->file, + }; + + return vma_merge(vmg->vmi, vmg->prev, &dummy, vmg->start, vmg->end, + vmg->flags, vmg->pgoff, vmg->policy, vmg->uffd_ctx, + vmg->anon_name); +} + +/* + * Helper function which provides a wrapper around the expansion of an existing + * VMA. + */ +static int expand_existing(struct vma_merge_struct *vmg) +{ + return vma_expand(vmg->vmi, vmg->vma, vmg->start, vmg->end, vmg->pgoff, + vmg->next); +} + +/* + * Helper function to reset merge state the associated VMA iterator to a + * specified new range. + */ +static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t flags) +{ + vma_iter_set(vmg->vmi, start); + + vmg->prev = NULL; + vmg->next = NULL; + vmg->vma = NULL; + + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + vmg->flags = flags; +} + +/* + * Helper function to try to merge a new VMA. 
+ * + * Update vmg and the iterator for it and try to merge, otherwise allocate a new + * VMA, link it to the maple tree and return it. + */ +static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, + struct vma_merge_struct *vmg, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t flags, + bool *was_merged) +{ + struct vm_area_struct *merged; + + vmg_set_range(vmg, start, end, pgoff, flags); + + merged = merge_new(vmg); + if (merged) { + *was_merged = true; + return merged; + } + + *was_merged = false; + return alloc_and_link_vma(mm, start, end, pgoff, flags); +} + +/* + * Helper function to reset the dummy anon_vma to indicate it has not been + * duplicated. + */ +static void reset_dummy_anon_vma(void) +{ + dummy_anon_vma.was_cloned = false; + dummy_anon_vma.was_unlinked = false; +} + +/* + * Helper function to remove all VMAs and destroy the maple tree associated with + * a virtual address space. Returns a count of VMAs in the tree. + */ +static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi) +{ + struct vm_area_struct *vma; + int count = 0; + + fail_prealloc = false; + reset_dummy_anon_vma(); + + vma_iter_set(vmi, 0); + for_each_vma(*vmi, vma) { + vm_area_free(vma); + count++; + } + + mtree_destroy(&mm->mm_mt); + mm->map_count = 0; + return count; +} + +/* Helper function to determine if VMA has had vma_start_write() performed. */ +static bool vma_write_started(struct vm_area_struct *vma) +{ + int seq = vma->vm_lock_seq; + + /* We reset after each check. */ + vma->vm_lock_seq = -1; + + /* The vma_start_write() stub simply increments this value. */ + return seq > -1; +} + +/* Helper function providing a dummy vm_ops->close() method.*/ +static void dummy_close(struct vm_area_struct *) +{ +} + static bool test_simple_merge(void) { struct vm_area_struct *vma; unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, flags); - struct vm_area_struct *vma_middle = alloc_vma(&mm, 0x1000, 0x2000, 1, flags); struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, flags); VMA_ITERATOR(vmi, &mm, 0x1000); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + .start = 0x1000, + .end = 0x2000, + .flags = flags, + .pgoff = 1, + }; ASSERT_FALSE(vma_link(&mm, vma_left)); - ASSERT_FALSE(vma_link(&mm, vma_middle)); ASSERT_FALSE(vma_link(&mm, vma_right)); - vma = vma_merge_new_vma(&vmi, vma_left, vma_middle, 0x1000, - 0x2000, 1); + vma = merge_new(&vmg); ASSERT_NE(vma, NULL); ASSERT_EQ(vma->vm_start, 0); @@ -142,10 +359,17 @@ static bool test_simple_expand(void) struct mm_struct mm = {}; struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags); VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .vmi = &vmi, + .vma = vma, + .start = 0, + .end = 0x3000, + .pgoff = 0, + }; ASSERT_FALSE(vma_link(&mm, vma)); - ASSERT_FALSE(vma_expand(&vmi, vma, 0, 0x3000, 0, NULL)); + ASSERT_FALSE(expand_existing(&vmg)); ASSERT_EQ(vma->vm_start, 0); ASSERT_EQ(vma->vm_end, 0x3000); @@ -178,6 +402,1042 @@ static bool test_simple_shrink(void) return true; } +static bool test_merge_new(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain_a = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_b = { + .anon_vma = 
&dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_c = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_d = { + .anon_vma = &dummy_anon_vma, + }; + int count; + struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; + bool merged; + + /* + * 0123456789abc + * AA B CC + */ + vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + ASSERT_NE(vma_a, NULL); + /* We give each VMA a single avc so we can test anon_vma duplication. */ + INIT_LIST_HEAD(&vma_a->anon_vma_chain); + list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); + + vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + ASSERT_NE(vma_b, NULL); + INIT_LIST_HEAD(&vma_b->anon_vma_chain); + list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); + + vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, flags); + ASSERT_NE(vma_c, NULL); + INIT_LIST_HEAD(&vma_c->anon_vma_chain); + list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); + + /* + * NO merge. + * + * 0123456789abc + * AA B ** CC + */ + vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, flags, &merged); + ASSERT_NE(vma_d, NULL); + INIT_LIST_HEAD(&vma_d->anon_vma_chain); + list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain); + ASSERT_FALSE(merged); + ASSERT_EQ(mm.map_count, 4); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AA*B DD CC + */ + vma_b->anon_vma = &dummy_anon_vma; + vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Merge with A, delete B. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x4000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge to PREVIOUS VMA. + * + * 0123456789abc + * AAAA* DD CC + */ + vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Extend A. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x5000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge to NEXT VMA. + * + * 0123456789abc + * AAAAA *DD CC + */ + vma_d->anon_vma = &dummy_anon_vma; + vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged); + ASSERT_EQ(vma, vma_d); + /* Prepend. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0x6000); + ASSERT_EQ(vma->vm_end, 0x9000); + ASSERT_EQ(vma->vm_pgoff, 6); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AAAAA*DDD CC + */ + vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Merge with A, delete D. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x9000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* + * Merge to NEXT VMA. + * + * 0123456789abc + * AAAAAAAAA *CC + */ + vma_c->anon_vma = &dummy_anon_vma; + vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, flags, &merged); + ASSERT_EQ(vma, vma_c); + /* Prepend C. 
*/ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0xa000); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0xa); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AAAAAAAAA*CCC + */ + vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Extend A and delete C. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 1); + + /* + * Final state. + * + * 0123456789abc + * AAAAAAAAAAAAA + */ + + count = 0; + vma_iter_set(&vmi, 0); + for_each_vma(vmi, vma) { + ASSERT_NE(vma, NULL); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + + vm_area_free(vma); + count++; + } + + /* Should only have one VMA left (though freed) after all is done.*/ + ASSERT_EQ(count, 1); + + mtree_destroy(&mm.mm_mt); + return true; +} + +static bool test_vma_merge_special_flags(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + vm_flags_t special_flags[] = { VM_IO, VM_DONTEXPAND, VM_PFNMAP, VM_MIXEDMAP }; + vm_flags_t all_special_flags = 0; + int i; + struct vm_area_struct *vma_left, *vma; + + /* Make sure there aren't new VM_SPECIAL flags. */ + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + all_special_flags |= special_flags[i]; + } + ASSERT_EQ(all_special_flags, VM_SPECIAL); + + /* + * 01234 + * AAA + */ + vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + ASSERT_NE(vma_left, NULL); + + /* 1. Set up new VMA with special flag that would otherwise merge. */ + + /* + * 01234 + * AAA* + * + * This should merge if not for the VM_SPECIAL flag. + */ + vmg_set_range(&vmg, 0x3000, 0x4000, 3, flags); + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + vm_flags_t special_flag = special_flags[i]; + + vma_left->__vm_flags = flags | special_flag; + vmg.flags = flags | special_flag; + vma = merge_new(&vmg); + ASSERT_EQ(vma, NULL); + } + + /* 2. Modify VMA with special flag that would otherwise merge. */ + + /* + * 01234 + * AAAB + * + * Create a VMA to modify. + */ + vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + ASSERT_NE(vma, NULL); + vmg.vma = vma; + + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + vm_flags_t special_flag = special_flags[i]; + + vma_left->__vm_flags = flags | special_flag; + vmg.flags = flags | special_flag; + vma = merge_existing(&vmg); + ASSERT_EQ(vma, NULL); + } + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vma_merge_with_close(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; + struct vm_area_struct *vma_next = + alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags); + struct vm_area_struct *vma; + + /* + * When we merge VMAs we sometimes have to delete others as part of the + * operation. 
+ * + * Considering the two possible adjacent VMAs to which a VMA can be + * merged: + * + * [ prev ][ vma ][ next ] + * + * In no case will we need to delete prev. If the operation is + * mergeable, then prev will be extended with one or both of vma and + * next deleted. + * + * As a result, during initial mergeability checks, only + * can_vma_merge_before() (which implies the VMA being merged with is + * 'next' as shown above) bothers to check to see whether the next VMA + * has a vm_ops->close() callback that will need to be called when + * removed. + * + * If it does, then we cannot merge as the resources that the close() + * operation potentially clears down are tied only to the existing VMA + * range and we have no way of extending those to the newly merged one. + * + * We must consider two scenarios: + * + * A. + * + * vm_ops->close: - - !NULL + * [ prev ][ vma ][ next ] + * + * Where prev may or may not be present/mergeable. + * + * This is picked up by a specific check in can_vma_merge_before(). + * + * B. + * + * vm_ops->close: - !NULL + * [ prev ][ vma ] + * + * Where prev and vma are present and mergeable. + * + * This is picked up by a specific check in the modified VMA merge. + * + * IMPORTANT NOTE: We make the assumption that the following case: + * + * - !NULL NULL + * [ prev ][ vma ][ next ] + * + * Cannot occur, because vma->vm_ops being the same implies the same + * vma->vm_file, and therefore this would mean that next->vm_ops->close + * would be set too, and thus scenario A would pick this up. + */ + + ASSERT_NE(vma_next, NULL); + + /* + * SCENARIO A + * + * 0123 + * *N + */ + + /* Make the next VMA have a close() callback. */ + vma_next->vm_ops = &vm_ops; + + /* Our proposed VMA has characteristics that would otherwise be merged. */ + vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); + + /* The next VMA having a close() operator should cause the merge to fail. */ + ASSERT_EQ(merge_new(&vmg), NULL); + + /* Now create the VMA so we can merge via modified flags. */ + vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); + vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags); + vmg.vma = vma; + + /* + * The VMA being modified in a way that would otherwise merge should + * also fail. + */ + ASSERT_EQ(merge_existing(&vmg), NULL); + + /* SCENARIO B + * + * 0123 + * P* + * + * In order for this scenario to trigger, the VMA currently being + * modified must also have a .close(). + */ + + /* Reset VMG state. */ + vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); + /* + * Make next unmergeable, and don't let the scenario A check pick this + * up; we want to reproduce scenario B only. + */ + vma_next->vm_ops = NULL; + vma_next->__vm_flags &= ~VM_MAYWRITE; + /* Allocate prev. */ + vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); + /* Assign a vm_ops->close() function to VMA explicitly. */ + vma->vm_ops = &vm_ops; + vmg.vma = vma; + /* Make sure merge does not occur.
*/ + ASSERT_EQ(merge_existing(&vmg), NULL); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vma_merge_new_with_close(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, flags); + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; + struct vm_area_struct *vma; + + /* + * We should allow the partial merge of a proposed new VMA if the + * surrounding VMAs have vm_ops->close() hooks (but are otherwise + * compatible), e.g.: + * + * New VMA + * A v-------v B + * |-----| |-----| + * close close + * + * Since the rule is to not DELETE a VMA with a close operation, this + * should be permitted, only rather than expanding A and deleting B, we + * should simply expand A and leave B intact, e.g.: + * + * New VMA + * A B + * |------------||-----| + * close close + */ + + /* Have prev and next have a vm_ops->close() hook. */ + vma_prev->vm_ops = &vm_ops; + vma_next->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x2000, 0x5000, 2, flags); + vma = merge_new(&vmg); + ASSERT_NE(vma, NULL); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x5000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->vm_ops, &vm_ops); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_merge_existing(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_prev, *vma_next; + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + + /* + * Merge right case - partial span. + * + * <-> + * 0123456789 + * VVVVNNN + * -> + * 0123456789 + * VNNNNNN + */ + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg.vma = vma; + vmg.prev = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vma_next->vm_start, 0x3000); + ASSERT_EQ(vma_next->vm_end, 0x9000); + ASSERT_EQ(vma_next->vm_pgoff, 3); + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_EQ(vma->vm_start, 0x2000); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 2); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_TRUE(vma_write_started(vma_next)); + ASSERT_EQ(mm.map_count, 2); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Merge right case - full span. + * + * <--> + * 0123456789 + * VVVVNNN + * -> + * 0123456789 + * NNNNNNN + */ + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags); + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vma_next->vm_start, 0x2000); + ASSERT_EQ(vma_next->vm_end, 0x9000); + ASSERT_EQ(vma_next->vm_pgoff, 2); + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_next)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted vma. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Merge left case - partial span. 
+ * + * <-> + * 0123456789 + * PPPVVVV + * -> + * 0123456789 + * PPPPPPV + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x6000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_EQ(vma->vm_start, 0x6000); + ASSERT_EQ(vma->vm_end, 0x7000); + ASSERT_EQ(vma->vm_pgoff, 6); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Merge left case - full span. + * + * <--> + * 0123456789 + * PPPVVVV + * -> + * 0123456789 + * PPPPPPP + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted vma. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Merge both case. + * + * <--> + * 0123456789 + * PPPVVVVNNN + * -> + * 0123456789 + * PPPPPPPPPP + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x9000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted vma and next. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Non-merge ranges. The modified VMA merge operation assumes that the + * caller always specifies ranges within the input VMA so we need only + * examine these cases.
+ * + * - + * - + * - + * <-> + * <> + * <> + * 0123456789a + * PPPVVVVVNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, flags); + + vmg_set_range(&vmg, 0x4000, 0x5000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x6000, 0x7000, 6, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x4000, 0x7000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x4000, 0x6000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); + + return true; +} + +static bool test_anon_vma_non_mergeable(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_prev, *vma_next; + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain1 = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain2 = { + .anon_vma = &dummy_anon_vma, + }; + + /* + * In the case of modified VMA merge, merging both left and right VMAs + * but where prev and next have incompatible anon_vma objects, we revert + * to a merge of prev and VMA: + * + * <--> + * 0123456789 + * PPPVVVVNNN + * -> + * 0123456789 + * PPPPPPPNNN + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + + /* + * Give both prev and next single anon_vma_chain fields, so they will + * merge with the NULL vmg->anon_vma. + * + * However, when prev is compared to next, the merge should fail. + */ + + INIT_LIST_HEAD(&vma_prev->anon_vma_chain); + list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); + ASSERT_TRUE(list_is_singular(&vma_prev->anon_vma_chain)); + vma_prev->anon_vma = &dummy_anon_vma; + ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_prev->anon_vma, vma_prev)); + + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); + ASSERT_TRUE(list_is_singular(&vma_next->anon_vma_chain)); + vma_next->anon_vma = (struct anon_vma *)2; + ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_next->anon_vma, vma_next)); + + ASSERT_FALSE(is_mergeable_anon_vma(vma_prev->anon_vma, vma_next->anon_vma, NULL)); + + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_FALSE(vma_write_started(vma_next)); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Now consider the new VMA case. This is equivalent, only adding a new + * VMA in a gap between prev and next. 
+ * + * <--> + * 0123456789 + * PPP****NNN + * -> + * 0123456789 + * PPPPPPPNNN + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + + INIT_LIST_HEAD(&vma_prev->anon_vma_chain); + list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); + vma_prev->anon_vma = (struct anon_vma *)1; + + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); + vma_next->anon_vma = (struct anon_vma *)2; + + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + + ASSERT_EQ(merge_new(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_FALSE(vma_write_started(vma_next)); + + /* Final cleanup. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + return true; +} + +static bool test_dup_anon_vma(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain = { + .anon_vma = &dummy_anon_vma, + }; + struct vm_area_struct *vma_prev, *vma_next, *vma; + + reset_dummy_anon_vma(); + + /* + * Expanding a VMA while deleting the next one duplicates next's anon_vma and + * assigns it to the expanded VMA. + * + * This covers new VMA merging, as these operations amount to a VMA + * expand. + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0, 0x5000, 0, flags); + vmg.vma = vma_prev; + vmg.next = vma_next; + + ASSERT_EQ(expand_existing(&vmg), 0); + + /* Will have been cloned. */ + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + /* Cleanup ready for next run. */ + cleanup_mm(&mm, &vmi); + + /* + * next has anon_vma, we assign to prev. + * + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + /* Initialise avc so mergeability check passes. */ + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain); + + vma_next->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x8000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to prev.
+ * + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x8000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to prev. + * + * |<----->| + * |-------************* + * prev vma + * extend shrink/delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to next. + * + * |<----->| + * *************-------| + * vma next + * shrink/delete extend + */ + + vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_next); + + ASSERT_EQ(vma_next->vm_start, 0x3000); + ASSERT_EQ(vma_next->vm_end, 0x8000); + + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_next->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vmi_prealloc_fail(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct vm_area_struct *vma_prev, *vma; + + /* + * We are merging vma into prev, with vma possessing an anon_vma, which + * will be duplicated. We cause the vmi preallocation to fail and assert + * the duplicated anon_vma is unlinked. + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + fail_prealloc = true; + + /* This will cause the merge to fail. */ + ASSERT_EQ(merge_existing(&vmg), NULL); + /* We will already have assigned the anon_vma. */ + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + /* And it was both cloned and unlinked. */ + ASSERT_TRUE(dummy_anon_vma.was_cloned); + ASSERT_TRUE(dummy_anon_vma.was_unlinked); + + cleanup_mm(&mm, &vmi); /* Resets fail_prealloc too. */ + + /* + * We repeat the same operation for expanding a VMA, which is what new + * VMA merging ultimately uses too. This asserts that unlinking is + * performed in this case too. 
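For reference, the fail_prealloc flag toggled by this test is a small fault-injection hook that the harness wraps around VMA iterator preallocation before including mm/vma.c. A minimal sketch of the idea (names mirror this file; the exact definition may differ):

    static bool fail_prealloc;

    /* Fail the next maple tree preallocation with -ENOMEM on demand. */
    #define vma_iter_prealloc(vmi, vma) \
        (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL))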
+ */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0, 0x5000, 3, flags); + vmg.vma = vma_prev; + vmg.next = vma; + + fail_prealloc = true; + ASSERT_EQ(expand_existing(&vmg), -ENOMEM); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(dummy_anon_vma.was_cloned); + ASSERT_TRUE(dummy_anon_vma.was_unlinked); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_merge_extend(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0x1000); + struct vm_area_struct *vma; + + vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); + alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + + /* + * Extend a VMA into the gap between itself and the following VMA. + * This should result in a merge. + * + * <-> + * * * + * + */ + + ASSERT_EQ(vma_merge_extend(&vmi, vma, 0x2000), vma); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x4000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 1); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_copy_vma(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + bool need_locks = false; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_new, *vma_next; + + /* Move backwards and do not merge. */ + + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); + + ASSERT_NE(vma_new, vma); + ASSERT_EQ(vma_new->vm_start, 0); + ASSERT_EQ(vma_new->vm_end, 0x2000); + ASSERT_EQ(vma_new->vm_pgoff, 0); + + cleanup_mm(&mm, &vmi); + + /* Move a VMA into position next to another and merge the two. */ + + vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, flags); + vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); + + ASSERT_EQ(vma_new, vma_next); + + cleanup_mm(&mm, &vmi); + return true; +} + int main(void) { int num_tests = 0, num_fail = 0; @@ -193,11 +1453,23 @@ int main(void) } \ } while (0) + /* Very simple tests to kick the tyres. */ TEST(simple_merge); TEST(simple_modify); TEST(simple_expand); TEST(simple_shrink); + TEST(merge_new); + TEST(vma_merge_special_flags); + TEST(vma_merge_with_close); + TEST(vma_merge_new_with_close); + TEST(merge_existing); + TEST(anon_vma_non_mergeable); + TEST(dup_anon_vma); + TEST(vmi_prealloc_fail); + TEST(merge_extend); + TEST(copy_vma); + #undef TEST printf("%d tests run, %d passed, %d failed.\n", diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 093560e5b2ac..a3c262c6eb73 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -81,8 +81,6 @@ #define AS_MM_ALL_LOCKS 2 -#define current NULL - /* We hardcode this for now. */ #define sysctl_max_map_count 0x1000000UL @@ -92,6 +90,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef unsigned long vm_flags_t; typedef __bitwise unsigned int vm_fault_t; +/* + * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...) + * either way :) + */ +#define pr_warn_once pr_err + typedef struct refcount_struct { atomic_t refs; } refcount_t; @@ -100,9 +104,30 @@ struct kref { refcount_t refcount; }; +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. 
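The vma_internal.h hunk just below also declares a callable get_current() so that userland test code can stand in for the kernel's current macro. A plausible test-side definition, given purely as a sketch (the __current name is illustrative, not taken from the patch):

    /* One static fake task suffices for this single-threaded test harness. */
    static struct task_struct __current;

    struct task_struct *get_current(void)
    {
        return &__current;
    }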
+ */ +enum { + TASK_COMM_LEN = 16, +}; + +struct task_struct { + char comm[TASK_COMM_LEN]; + pid_t pid; + struct mm_struct *mm; +}; + +struct task_struct *get_current(void); +#define current get_current() + struct anon_vma { struct anon_vma *root; struct rb_root_cached rb_root; + + /* Test fields. */ + bool was_cloned; + bool was_unlinked; }; struct anon_vma_chain { @@ -682,13 +707,21 @@ static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct return 0; } -static inline int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *) +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { + /* For testing purposes, we indicate that an anon_vma has been cloned. */ + if (src->anon_vma != NULL) { + dst->anon_vma = src->anon_vma; + dst->anon_vma->was_cloned = true; + } + return 0; } -static inline void vma_start_write(struct vm_area_struct *) +static inline void vma_start_write(struct vm_area_struct *vma) { + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -759,8 +792,10 @@ static inline void vma_assert_write_locked(struct vm_area_struct *) { } -static inline void unlink_anon_vmas(struct vm_area_struct *) +static inline void unlink_anon_vmas(struct vm_area_struct *vma) { + /* For testing purposes, indicate that the anon_vma was unlinked. */ + vma->anon_vma->was_unlinked = true; } static inline void anon_vma_unlock_write(struct anon_vma *) -- cgit v1.2.3 From 2f1c6611b0a89afcb8641471af5f223c9caa01e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:15 +0100 Subject: mm: introduce vma_merge_struct and abstract vma_merge(),vma_modify() Rather than passing around huge numbers of parameters to numerous helper functions, abstract them into a single struct that we thread through the operation, the vma_merge_struct ('vmg'). Adjust vma_merge() and vma_modify() to accept this parameter, as well as predicate functions can_vma_merge_before(), can_vma_merge_after(), and the vma_modify_...() helper functions. Also introduce VMG_STATE() and VMG_VMA_STATE() helper macros to allow for easy vmg declaration. We additionally remove the requirement that vma_merge() is passed a VMA object representing the candidate new VMA. Previously it used this to obtain the mm_struct, file and anon_vma properties of the proposed range (a rather confusing state of affairs), which are now provided by the vmg directly. We also remove the pgoff calculation previously performed in vma_modify(), and instead calculate this in VMG_VMA_STATE() via the vma_pgoff_offset() helper. Link: https://lkml.kernel.org/r/a955aad09d81329f6fbeb636b2dd10cde7b73dab.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E.
McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 76 ++++++++++-------- mm/vma.c | 207 +++++++++++++++++++++++++++++------------------- mm/vma.h | 127 +++++++++++++++++------------ tools/testing/vma/vma.c | 43 +--------- 4 files changed, 246 insertions(+), 207 deletions(-) (limited to 'tools') diff --git a/mm/mmap.c b/mm/mmap.c index fc726c4e98be..ca9c6939638b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1373,10 +1373,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long end = addr + len; unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; - pgoff_t vm_pgoff; int error = -ENOMEM; VMA_ITERATOR(vmi, mm, addr); + VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); + vmg.file = file; /* Find the first overlapping VMA */ vma = vma_find(&vmi, end); init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); @@ -1389,12 +1390,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (error) goto gather_failed; - next = vms.next; - prev = vms.prev; + next = vmg.next = vms.next; + prev = vmg.prev = vms.prev; vma = NULL; } else { - next = vma_next(&vmi); - prev = vma_prev(&vmi); + next = vmg.next = vma_next(&vmi); + prev = vmg.prev = vma_prev(&vmi); if (prev) vma_iter_next_range(&vmi); } @@ -1414,6 +1415,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vms.nr_accounted = 0; vm_flags |= VM_ACCOUNT; + vmg.flags = vm_flags; } if (vm_flags & VM_SPECIAL) @@ -1422,28 +1424,31 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Attempt to expand an old mapping */ /* Check next */ if (next && next->vm_start == end && !vma_policy(next) && - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, - NULL_VM_UFFD_CTX, NULL)) { + can_vma_merge_before(&vmg)) { merge_end = next->vm_end; vma = next; - vm_pgoff = next->vm_pgoff - pglen; + vmg.pgoff = next->vm_pgoff - pglen; + /* + * We set this here so if we will merge with the previous VMA in + * the code below, can_vma_merge_after() ensures anon_vma + * compatibility between prev and next. + */ + vmg.anon_vma = vma->anon_vma; + vmg.uffd_ctx = vma->vm_userfaultfd_ctx; } /* Check prev */ if (prev && prev->vm_end == addr && !vma_policy(prev) && - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, - pgoff, vma->vm_userfaultfd_ctx, NULL) : - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, - NULL_VM_UFFD_CTX, NULL))) { + can_vma_merge_after(&vmg)) { merge_start = prev->vm_start; vma = prev; - vm_pgoff = prev->vm_pgoff; + vmg.pgoff = prev->vm_pgoff; vma_prev(&vmi); /* Equivalent to going to the previous range */ } if (vma) { /* Actually expand, if possible */ - if (!vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { + if (!vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } @@ -1774,26 +1779,29 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. 
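Note how the do_brk_flags() rework below seeds the vmg's pgoff: anonymous mappings follow the linear convention pgoff == addr >> PAGE_SHIFT, which is what PHYS_PFN() computes. A worked example, assuming 4KiB pages (PAGE_SHIFT == 12):

    unsigned long addr = 0x7f1234560000UL;
    pgoff_t pgoff = PHYS_PFN(addr); /* == addr >> 12 == 0x7f1234560 */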
*/ - if (vma && vma->vm_end == addr && !vma_policy(vma) && - can_vma_merge_after(vma, flags, NULL, NULL, - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - vma_iter_config(vmi, vma->vm_start, addr + len); - if (vma_iter_prealloc(vmi, vma)) - goto unacct_fail; - - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - vma->vm_end = addr + len; - vm_flags_set(vma, VM_SOFTDIRTY); - vma_iter_store(vmi, vma); - - vma_complete(&vp, vmi, mm); - validate_mm(mm); - khugepaged_enter_vma(vma, flags); - goto out; + if (vma && vma->vm_end == addr && !vma_policy(vma)) { + VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + + vmg.prev = vma; + if (can_vma_merge_after(&vmg)) { + vma_iter_config(vmi, vma->vm_start, addr + len); + if (vma_iter_prealloc(vmi, vma)) + goto unacct_fail; + + vma_start_write(vma); + + init_vma_prep(&vp, vma); + vma_prepare(&vp); + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + vma->vm_end = addr + len; + vm_flags_set(vma, VM_SOFTDIRTY); + vma_iter_store(vmi, vma); + + vma_complete(&vp, vmi, mm); + validate_mm(mm); + khugepaged_enter_vma(vma, flags); + goto out; + } } if (vma) diff --git a/mm/vma.c b/mm/vma.c index 1736bb237b2c..6be645240f07 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -7,16 +7,18 @@ #include "vma_internal.h" #include "vma.h" -/* - * If the vma has a ->close operation then the driver probably needs to release - * per-vma resources, so we don't attempt to merge those if the caller indicates - * the current vma may be removed as part of the merge. - */ -static inline bool is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name, bool may_remove_vma) +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; + /* + * If the vma has a ->close operation then the driver probably needs to + * release per-vma resources, so we don't attempt to merge those if the + * caller indicates the current vma may be removed as part of the merge, + * which is the case if we are attempting to merge the next VMA into + * this one. + */ + bool may_remove_vma = merge_next; + /* * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark @@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma, * the kernel to generate new VMAs when old one could be * extended instead. */ - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) return false; - if (vma->vm_file != file) + if (vma->vm_file != vmg->file) return false; if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) return false; - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) return false; return true; } @@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp, * We assume the vma may be removed as part of the merge. 
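To make the calling convention concrete, here is a condensed sketch of a caller-side check, mirroring the mmap_region() hunk earlier in this patch (illustrative only; expand_over_next() is a hypothetical stand-in for the expand path):

    VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
    vmg.next = vma_lookup(mm, end);

    /* Can the new range [addr, end) fuse with the VMA that follows it? */
    if (vmg.next && vmg.next->vm_start == end && can_vma_merge_before(&vmg))
        expand_over_next(&vmg);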
*/ bool -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +can_vma_merge_before(struct vma_merge_struct *vmg) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { - if (vma->vm_pgoff == vm_pgoff) + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); + + if (is_mergeable_vma(vmg, /* merge_next = */ true) && + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { + if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } + return false; } @@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, * * We assume that vma is not removed as part of the merge. */ -bool -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +bool can_vma_merge_after(struct vma_merge_struct *vmg) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { - pgoff_t vm_pglen; - - vm_pglen = vma_pages(vma); - if (vma->vm_pgoff + vm_pglen == vm_pgoff) + if (is_mergeable_vma(vmg, /* merge_next = */ false) && + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } return false; @@ -1017,16 +1012,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL */ -static struct vm_area_struct -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *src, unsigned long addr, unsigned long end, - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg) { - struct mm_struct *mm = src->vm_mm; - struct anon_vma *anon_vma = src->anon_vma; - struct file *file = src->vm_file; + struct mm_struct *mm = vmg->mm; + struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; struct vm_area_struct *anon_dup = NULL; @@ -1036,16 +1025,18 @@ static struct vm_area_struct bool merge_prev = false; bool merge_next = false; bool vma_expanded = false; + unsigned long addr = vmg->start; + unsigned long end = vmg->end; unsigned long vma_start = addr; unsigned long vma_end = end; - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + pgoff_t pglen = PHYS_PFN(end - addr); long adj_start = 0; /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. */ - if (vm_flags & VM_SPECIAL) + if (vmg->flags & VM_SPECIAL) return NULL; /* Does the input range span an existing VMA? 
(cases 5 - 8) */ @@ -1053,27 +1044,26 @@ static struct vm_area_struct if (!curr || /* cases 1 - 4 */ end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ - next = vma_lookup(mm, end); + next = vmg->next = vma_lookup(mm, end); else - next = NULL; /* case 5 */ + next = vmg->next = NULL; /* case 5 */ if (prev) { vma_start = prev->vm_start; vma_pgoff = prev->vm_pgoff; /* Can we merge the predecessor? */ - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) - && can_vma_merge_after(prev, vm_flags, anon_vma, file, - pgoff, vm_userfaultfd_ctx, anon_name)) { + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy) + && can_vma_merge_after(vmg)) { + merge_prev = true; - vma_prev(vmi); + vma_prev(vmg->vmi); } } /* Can we merge the successor? */ - if (next && mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx, anon_name)) { + if (next && mpol_equal(vmg->policy, vma_policy(next)) && + can_vma_merge_before(vmg)) { merge_next = true; } @@ -1164,13 +1154,13 @@ static struct vm_area_struct vma_expanded = true; if (vma_expanded) { - vma_iter_config(vmi, vma_start, vma_end); + vma_iter_config(vmg->vmi, vma_start, vma_end); } else { - vma_iter_config(vmi, adjust->vm_start + adj_start, + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, adjust->vm_end); } - if (vma_iter_prealloc(vmi, vma)) + if (vma_iter_prealloc(vmg->vmi, vma)) goto prealloc_fail; init_multi_vma_prep(&vp, vma, adjust, remove, remove2); @@ -1182,20 +1172,20 @@ static struct vm_area_struct vma_set_range(vma, vma_start, vma_end, vma_pgoff); if (vma_expanded) - vma_iter_store(vmi, vma); + vma_iter_store(vmg->vmi, vma); if (adj_start) { adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { WARN_ON(vma_expanded); - vma_iter_store(vmi, next); + vma_iter_store(vmg->vmi, next); } } - vma_complete(&vp, vmi, mm); + vma_complete(&vp, vmg->vmi, mm); validate_mm(mm); - khugepaged_enter_vma(res, vm_flags); + khugepaged_enter_vma(res, vmg->flags); return res; prealloc_fail: @@ -1203,8 +1193,8 @@ prealloc_fail: unlink_anon_vmas(anon_dup); anon_vma_fail: - vma_iter_set(vmi, addr); - vma_iter_load(vmi); + vma_iter_set(vmg->vmi, addr); + vma_iter_load(vmg->vmi); return NULL; } @@ -1221,32 +1211,27 @@ anon_vma_fail: * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name) +static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *merged; - merged = vma_merge(vmi, prev, vma, start, end, vm_flags, - pgoff, policy, uffd_ctx, anon_name); + /* First, try to merge. */ + merged = vma_merge(vmg); if (merged) return merged; - if (vma->vm_start < start) { - int err = split_vma(vmi, vma, start, 1); + /* Split any preceding portion of the VMA. */ + if (vma->vm_start < vmg->start) { + int err = split_vma(vmg->vmi, vma, vmg->start, 1); if (err) return ERR_PTR(err); } - if (vma->vm_end > end) { - int err = split_vma(vmi, vma, end, 0); + /* Split any trailing portion of the VMA. 
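A worked illustration of the two split calls, with ranges assumed for the example: say vma spans [0x2000, 0x7000) and the requested range is [0x3000, 0x5000), with no merge possible. Then:

    split_vma(vmg->vmi, vma, 0x3000, 1); /* carves [0x2000, 0x3000) off below */
    split_vma(vmg->vmi, vma, 0x5000, 0); /* carves [0x5000, 0x7000) off above */
    /* vma now spans exactly [0x3000, 0x5000) and is returned for modification. */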
*/ + if (vma->vm_end > vmg->end) { + int err = split_vma(vmg->vmi, vma, vmg->end, 0); if (err) return ERR_PTR(err); @@ -1255,6 +1240,65 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, return vma; } +struct vm_area_struct *vma_modify_flags( + struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + unsigned long new_flags) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + vmg.anon_name = new_name; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.policy = new_pol; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + vmg.uffd_ctx = new_ctx; + + return vma_modify(&vmg); +} + /* * Attempt to merge a newly mapped VMA with those adjacent to it. The caller * must ensure that [start, end) does not overlap any existing VMA. @@ -1264,8 +1308,11 @@ struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.pgoff = pgoff; + + return vma_merge(&vmg); } /* @@ -1276,12 +1323,10 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { - pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); + VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, - vma->vm_flags, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + return vma_merge(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) diff --git a/mm/vma.h b/mm/vma.h index 2a3a5c89a33b..e761713f855f 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -52,6 +52,59 @@ struct vma_munmap_struct { unsigned long data_vm; }; +/* Represents a VMA merge operation. */ +struct vma_merge_struct { + struct mm_struct *mm; + struct vma_iterator *vmi; + pgoff_t pgoff; + struct vm_area_struct *prev; + struct vm_area_struct *next; /* Modified by vma_merge(). */ + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ + unsigned long start; + unsigned long end; + unsigned long flags; + struct file *file; + struct anon_vma *anon_vma; + struct mempolicy *policy; + struct vm_userfaultfd_ctx uffd_ctx; + struct anon_vma_name *anon_name; +}; + +/* Assumes addr >= vma->vm_start. 
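For instance, assuming 4KiB pages, the helper defined just below applied to a VMA starting at 0x3000 with vm_pgoff == 3 and queried at addr == 0x5000 gives:

    pgoff_t pgoff = vma_pgoff_offset(vma, 0x5000);
    /* == 3 + PHYS_PFN(0x5000 - 0x3000) == 3 + 2 == 5 */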
*/ +static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, + unsigned long addr) +{ + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); +} + +#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ + struct vma_merge_struct name = { \ + .mm = mm_, \ + .vmi = vmi_, \ + .start = start_, \ + .end = end_, \ + .flags = flags_, \ + .pgoff = pgoff_, \ + } + +#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ + struct vma_merge_struct name = { \ + .mm = vma_->vm_mm, \ + .vmi = vmi_, \ + .prev = prev_, \ + .next = NULL, \ + .vma = vma_, \ + .start = start_, \ + .end = end_, \ + .flags = vma_->vm_flags, \ + .pgoff = vma_pgoff_offset(vma_, start_), \ + .file = vma_->vm_file, \ + .anon_vma = vma_->anon_vma, \ + .policy = vma_policy(vma_), \ + .uffd_ctx = vma_->vm_userfaultfd_ctx, \ + .anon_name = anon_vma_name(vma_), \ + } + #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else @@ -212,80 +265,52 @@ void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); -/* Required by mmap_region(). */ -bool -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name); - -/* Required by mmap_region() and do_brk_flags(). */ -bool -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name); - -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name); +/* + * Can we merge the VMA described by vmg into the following VMA vmg->next? + * + * Required by mmap_region(). + */ +bool can_vma_merge_before(struct vma_merge_struct *vmg); + +/* + * Can we merge the VMA described by vmg into the preceding VMA vmg->prev? + * + * Required by mmap_region() and do_brk_flags(). + */ +bool can_vma_merge_after(struct vma_merge_struct *vmg); /* We are about to modify the VMA's flags. */ -static inline struct vm_area_struct -*vma_modify_flags(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); -} +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ -static inline struct vm_area_struct +struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, - struct anon_vma_name *new_name) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); -} + struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. 
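All of these wrappers share one shape, seeded by VMG_VMA_STATE(). Condensed from the mm/vma.c hunk earlier in this patch, vma_modify_policy() amounts to:

    VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); /* pgoff, file, policy etc. derived from vma */

    vmg.policy = new_pol; /* override only the field being changed */

    return vma_modify(&vmg);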
*/ -static inline struct vm_area_struct +struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct mempolicy *new_pol) -{ - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} + struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ -static inline struct vm_area_struct +struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), new_ctx, anon_vma_name(vma)); -} + struct vm_userfaultfd_ctx new_ctx); struct vm_area_struct *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 71bd30d5da81..7a3f59186464 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -22,26 +22,6 @@ static bool fail_prealloc; */ #include "../../../mm/vma.c" -/* - * Temporarily forward-ported from a future in which vmg's are used for merging. - */ -struct vma_merge_struct { - struct mm_struct *mm; - struct vma_iterator *vmi; - pgoff_t pgoff; - struct vm_area_struct *prev; - struct vm_area_struct *next; /* Modified by vma_merge(). */ - struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ - unsigned long start; - unsigned long end; - unsigned long flags; - struct file *file; - struct anon_vma *anon_vma; - struct mempolicy *policy; - struct vm_userfaultfd_ctx uffd_ctx; - struct anon_vma_name *anon_name; -}; - const struct vm_operations_struct vma_dummy_vm_ops; static struct anon_vma dummy_anon_vma; @@ -115,14 +95,6 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, /* Helper function which provides a wrapper around a merge new VMA operation. */ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) { - /* vma_merge() needs a VMA to determine mm, anon_vma, and file. */ - struct vm_area_struct dummy = { - .vm_mm = vmg->mm, - .vm_flags = vmg->flags, - .anon_vma = vmg->anon_vma, - .vm_file = vmg->file, - }; - /* * For convenience, get prev and next VMAs. Which the new VMA operation * requires. @@ -131,8 +103,7 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) vmg->prev = vma_prev(vmg->vmi); vma_iter_set(vmg->vmi, vmg->start); - return vma_merge_new_vma(vmg->vmi, vmg->prev, &dummy, vmg->start, - vmg->end, vmg->pgoff); + return vma_merge(vmg); } /* @@ -141,17 +112,7 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) */ static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) { - /* vma_merge() needs a VMA to determine mm, anon_vma, and file. 
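That requirement is precisely what the dummy VMA deleted below catered for; now that the vmg carries mm, flags, anon_vma and file itself, the test wrapper collapses to a bare call, as the hunk that follows shows:

    static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg)
    {
        return vma_merge(vmg);
    }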
*/ - struct vm_area_struct dummy = { - .vm_mm = vmg->mm, - .vm_flags = vmg->flags, - .anon_vma = vmg->anon_vma, - .vm_file = vmg->file, - }; - - return vma_merge(vmg->vmi, vmg->prev, &dummy, vmg->start, vmg->end, - vmg->flags, vmg->pgoff, vmg->policy, vmg->uffd_ctx, - vmg->anon_name); + return vma_merge(vmg); } /* -- cgit v1.2.3 From fc21959f74bc1138b28e90a02ec224ab8626111e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:17 +0100 Subject: mm: abstract vma_expand() to use vma_merge_struct The purpose of the vmg is to thread merge state through functions and avoid egregious parameter lists. We expand this to vma_expand(), which is used for a number of merge cases. Accordingly, adjust its callers, mmap_region() and relocate_vma_down(), to use a vmg. An added purpose of this change is the ability in a future commit to perform all new VMA range merging using vma_expand(). Link: https://lkml.kernel.org/r/4bc8c9dbc9ca52452ef8e587b28fe555854ceb38.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 15 ++++++++------- mm/vma.c | 39 +++++++++++++++++---------------------- mm/vma.h | 5 +---- tools/testing/vma/vma.c | 3 +-- 4 files changed, 27 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/mm/mmap.c b/mm/mmap.c index 3af8459e4e88..2b3006efd3fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1371,7 +1371,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct ma_state mas_detach; struct maple_tree mt_detach; unsigned long end = addr + len; - unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; int error = -ENOMEM; VMA_ITERATOR(vmi, mm, addr); @@ -1424,8 +1423,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Attempt to expand an old mapping */ /* Check next */ if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { - merge_end = next->vm_end; - vma = next; + vmg.end = next->vm_end; + vma = vmg.vma = next; vmg.pgoff = next->vm_pgoff - pglen; /* * We set this here so if we will merge with the previous VMA in @@ -1438,15 +1437,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Check prev */ if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { - merge_start = prev->vm_start; - vma = prev; + vmg.start = prev->vm_start; + vma = vmg.vma = prev; vmg.pgoff = prev->vm_pgoff; vma_prev(&vmi); /* Equivalent to going to the previous range */ } if (vma) { /* Actually expand, if possible */ - if (!vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) { + if (!vma_expand(&vmg)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } @@ -2320,6 +2319,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); struct vm_area_struct *next; struct mmu_gather tlb; @@ -2336,7 +2336,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) + vmg.vma = vma; + if (vma_expand(&vmg)) return -ENOMEM; /* diff --git 
a/mm/vma.c b/mm/vma.c index 3284bb778c3d..d1033dade70e 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -467,30 +467,25 @@ void validate_mm(struct mm_struct *mm) /* * vma_expand - Expand an existing VMA * - * @vmi: The vma iterator - * @vma: The vma to expand - * @start: The start of the vma - * @end: The exclusive end of the vma - * @pgoff: The page offset of vma - * @next: The current of next vma. + * @vmg: Describes a VMA expansion operation. * - * Expand @vma to @start and @end. Can expand off the start and end. Will - * expand over @next if it's different from @vma and @end == @next->vm_end. - * Checking if the @vma can expand and merge with @next needs to be handled by - * the caller. + * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. + * Will expand over vmg->next if it's different from vmg->vma and vmg->end == + * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with + * vmg->next needs to be handled by the caller. * * Returns: 0 on success */ -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) +int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; bool remove_next = false; + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *next = vmg->next; struct vma_prepare vp; vma_start_write(vma); - if (next && (vma != next) && (end == next->vm_end)) { + if (next && (vma != next) && (vmg->end == next->vm_end)) { int ret; remove_next = true; @@ -503,21 +498,21 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON(next && !vp.remove && - next != vma && end > next->vm_start); + next != vma && vmg->end > next->vm_start); /* Only handles expanding */ - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); + VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); /* Note: vma iterator must be pointing to 'start' */ - vma_iter_config(vmi, start, end); - if (vma_iter_prealloc(vmi, vma)) + vma_iter_config(vmg->vmi, vmg->start, vmg->end); + if (vma_iter_prealloc(vmg->vmi, vma)) goto nomem; vma_prepare(&vp); - vma_adjust_trans_huge(vma, start, end, 0); - vma_set_range(vma, start, end, pgoff); - vma_iter_store(vmi, vma); + vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0); + vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); + vma_iter_store(vmg->vmi, vma); - vma_complete(&vp, vmi, vma->vm_mm); + vma_complete(&vp, vmg->vmi, vma->vm_mm); return 0; nomem: diff --git a/mm/vma.h b/mm/vma.h index e761713f855f..218d884ff5ff 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -128,10 +128,7 @@ void init_vma_prep(struct vma_prepare *vp, void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm); -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next); - +int vma_expand(struct vma_merge_struct *vmg); int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 7a3f59186464..f6c4706a861f 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -121,8 +121,7 @@ static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) */ static int expand_existing(struct vma_merge_struct 
*vmg) { - return vma_expand(vmg->vmi, vmg->vma, vmg->start, vmg->end, vmg->pgoff, - vmg->next); + return vma_expand(vmg); } /* -- cgit v1.2.3 From cacded5e42b9609b07b22d80c10f0076d439f7d1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:18 +0100 Subject: mm: avoid using vma_merge() for new VMAs Abstract vma_merge_new_vma() to use vma_merge_struct and rename the resultant function to vma_merge_new_range() to make clear what the purpose of this function is - a new VMA is desired in the specified range, and we wish to see if it is possible to 'merge' surrounding VMAs into this range rather than having to allocate a new VMA. Note that this function uses vma_expand() exclusively, so it adopts its requirement that the iterator point at or before the gap. We add an assert to this effect. This is as opposed to vma_merge_existing_range(), which will be introduced in a subsequent commit and will provide the same functionality for cases in which we are modifying an existing VMA. In mmap_region() and do_brk_flags() we open code scenarios where we prefer to use vma_expand() rather than invoke a full vma_merge() operation. Abstract this logic, eliminate all of the open-coding, and apply the same logic to all cases where we add new VMAs, so that they ultimately use vma_expand() rather than vma_merge(). Doing so removes duplication and simplifies VMA merging in all such cases, laying the ground for us to eliminate the merging of new VMAs in vma_merge() altogether. Also add the ability for the vmg to track state and to report errors, allowing us to differentiate a failed merge from an inability to allocate memory in callers. This makes it far easier to understand what is happening in these cases, avoiding confusion and bugs and allowing for future optimisation. Also introduce vma_iter_next_rewind() to allow for retrieval of the next, and (optionally) the prev VMA, rewinding to the start of the previous gap. Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma comparison for the case of merging on both sides, where the anon_vma of the VMA being merged may be compatible with prev and next, but prev's and next's anon_vmas may not be compatible with each other. Finally, also introduce can_vma_merge_left() / can_vma_merge_right() to check adjacent VMA compatibility and that they are indeed adjacent. Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Mark Brown Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. 
McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 92 ++++-------------- mm/vma.c | 200 ++++++++++++++++++++++++++++++++++----- mm/vma.h | 48 +++++++++- tools/testing/vma/vma.c | 33 ++++++- tools/testing/vma/vma_internal.h | 6 ++ 5 files changed, 279 insertions(+), 100 deletions(-) (limited to 'tools') diff --git a/mm/mmap.c b/mm/mmap.c index 2b3006efd3fb..02f7b45c3076 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1364,8 +1364,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct vm_area_struct *next, *prev, *merge; pgoff_t pglen = PHYS_PFN(len); + struct vm_area_struct *merge; unsigned long charged = 0; struct vma_munmap_struct vms; struct ma_state mas_detach; @@ -1389,14 +1389,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (error) goto gather_failed; - next = vmg.next = vms.next; - prev = vmg.prev = vms.prev; + vmg.next = vms.next; + vmg.prev = vms.prev; vma = NULL; } else { - next = vmg.next = vma_next(&vmi); - prev = vmg.prev = vma_prev(&vmi); - if (prev) - vma_iter_next_range(&vmi); + vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); } /* Check against address space limit. */ @@ -1417,46 +1414,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vmg.flags = vm_flags; } - if (vm_flags & VM_SPECIAL) - goto cannot_expand; - - /* Attempt to expand an old mapping */ - /* Check next */ - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { - vmg.end = next->vm_end; - vma = vmg.vma = next; - vmg.pgoff = next->vm_pgoff - pglen; - /* - * We set this here so if we will merge with the previous VMA in - * the code below, can_vma_merge_after() ensures anon_vma - * compatibility between prev and next. - */ - vmg.anon_vma = vma->anon_vma; - vmg.uffd_ctx = vma->vm_userfaultfd_ctx; - } - - /* Check prev */ - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { - vmg.start = prev->vm_start; - vma = vmg.vma = prev; - vmg.pgoff = prev->vm_pgoff; - vma_prev(&vmi); /* Equivalent to going to the previous range */ - } - - if (vma) { - /* Actually expand, if possible */ - if (!vma_expand(&vmg)) { - khugepaged_enter_vma(vma, vm_flags); - goto expanded; - } - - /* If the expand fails, then reposition the vma iterator */ - if (unlikely(vma == prev)) - vma_iter_set(&vmi, addr); - } - -cannot_expand: - + vma = vma_merge_new_range(&vmg); + if (vma) + goto expanded; /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but @@ -1503,10 +1463,11 @@ cannot_expand: * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge_new_vma(&vmi, prev, vma, - vma->vm_start, vma->vm_end, - vma->vm_pgoff); + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { + vmg.flags = vma->vm_flags; + /* If this fails, state is reset ready for a reattempt. */ + merge = vma_merge_new_range(&vmg); + if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -1522,6 +1483,7 @@ cannot_expand: vm_flags = vma->vm_flags; goto unmap_writable; } + vma_iter_config(&vmi, addr, end); } vm_flags = vma->vm_flags; @@ -1554,7 +1516,7 @@ cannot_expand: vma_link_file(vma); /* - * vma_merge() calls khugepaged_enter_vma() either, the below + * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. 
*/ khugepaged_enter_vma(vma, vma->vm_flags); @@ -1609,7 +1571,7 @@ unmap_and_free_vma: vma_iter_set(&vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ - unmap_region(&vmi.mas, vma, prev, next); + unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); } if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); @@ -1756,7 +1718,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vma_prepare vp; /* * Check against address space limits by the changed size @@ -1780,25 +1741,12 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); vmg.prev = vma; - if (can_vma_merge_after(&vmg)) { - vma_iter_config(vmi, vma->vm_start, addr + len); - if (vma_iter_prealloc(vmi, vma)) - goto unacct_fail; - - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - vma->vm_end = addr + len; - vm_flags_set(vma, VM_SOFTDIRTY); - vma_iter_store(vmi, vma); - - vma_complete(&vp, vmi, mm); - validate_mm(mm); - khugepaged_enter_vma(vma, flags); + vma_iter_next_range(vmi); + + if (vma_merge_new_range(&vmg)) goto out; - } + else if (vmg_nomem(&vmg)) + goto unacct_fail; } if (vma) diff --git a/mm/vma.c b/mm/vma.c index d1033dade70e..749c4881fd60 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -55,6 +55,13 @@ static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, return anon_vma1 == anon_vma2; } +/* Are the anon_vma's belonging to each VMA compatible with one another? */ +static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, + struct vm_area_struct *vma2) +{ + return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); +} + /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct @@ -130,6 +137,44 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg) return false; } +/* + * Can the proposed VMA be merged with the left (previous) VMA taking into + * account the start position of the proposed range. + */ +static bool can_vma_merge_left(struct vma_merge_struct *vmg) + +{ + return vmg->prev && vmg->prev->vm_end == vmg->start && + can_vma_merge_after(vmg); +} + +/* + * Can the proposed VMA be merged with the right (next) VMA taking into + * account the end position of the proposed range. + * + * In addition, if we can merge with the left VMA, ensure that left and right + * anon_vma's are also compatible. + */ +static bool can_vma_merge_right(struct vma_merge_struct *vmg, + bool can_merge_left) +{ + if (!vmg->next || vmg->end != vmg->next->vm_start || + !can_vma_merge_before(vmg)) + return false; + + if (!can_merge_left) + return true; + + /* + * If we can merge with prev (left) and next (right), indicating that + * each VMA's anon_vma is compatible with the proposed anon_vma, this + * does not mean prev and next are compatible with EACH OTHER. + * + * We therefore check this in addition to mergeability to either side. + */ + return are_anon_vmas_compatible(vmg->prev, vmg->next); +} + /* * Close a vm structure and free it. 
*/ @@ -464,6 +509,111 @@ void validate_mm(struct mm_struct *mm) } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ +/* + * vma_merge_new_range - Attempt to merge a new VMA into address space + * + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end + * (exclusive), which we try to merge with any adjacent VMAs if possible. + * + * We are about to add a VMA to the address space starting at @vmg->start and + * ending at @vmg->end. There are three different possible scenarios: + * + * 1. There is a VMA with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - + * EXPAND that VMA: + * + * Proposed: |-----| or |-----| + * Existing: |----| |----| + * + * 2. There are VMAs with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - + * EXPAND the former and REMOVE the latter: + * + * Proposed: |-----| + * Existing: |----| |----| + * + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those + * VMAs do not have identical attributes - NO MERGE POSSIBLE. + * + * In instances where we can merge, this function returns the expanded VMA which + * will have its range adjusted accordingly and the underlying maple tree also + * adjusted. + * + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer + * to the VMA we expanded. + * + * This function adjusts @vmg to provide @vmg->next if not already specified, + * and adjusts [@vmg->start, @vmg->end) to span the expanded range. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - The caller must have determined that [@vmg->start, @vmg->end) is empty, + other than VMAs that will be unmapped should the operation succeed. + * - The caller must have specified the previous vma in @vmg->prev. + * - The caller must have specified the next vma in @vmg->next. + * - The caller must have positioned the vmi at or before the gap. + */ +struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next = vmg->next; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + pgoff_t pgoff = vmg->pgoff; + pgoff_t pglen = PHYS_PFN(end - start); + bool can_merge_left, can_merge_right; + + mmap_assert_write_locked(vmg->mm); + VM_WARN_ON(vmg->vma); + /* vmi must point at or before the gap. */ + VM_WARN_ON(vma_iter_addr(vmg->vmi) > end); + + vmg->state = VMA_MERGE_NOMERGE; + + /* Special VMAs are unmergeable, also if no prev/next. */ + if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) + return NULL; + + can_merge_left = can_vma_merge_left(vmg); + can_merge_right = can_vma_merge_right(vmg, can_merge_left); + + /* If we can merge with the next VMA, adjust vmg accordingly. */ + if (can_merge_right) { + vmg->end = next->vm_end; + vmg->vma = next; + vmg->pgoff = next->vm_pgoff - pglen; + } + + /* If we can merge with the previous VMA, adjust vmg accordingly. */ + if (can_merge_left) { + vmg->start = prev->vm_start; + vmg->vma = prev; + vmg->pgoff = prev->vm_pgoff; + + vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ + } + + /* + * Now try to expand adjacent VMA(s). This takes care of removing the + * following VMA if we have VMAs on both sides. + */ + if (vmg->vma && !vma_expand(vmg)) { + khugepaged_enter_vma(vmg->vma, vmg->flags); + vmg->state = VMA_MERGE_SUCCESS; + return vmg->vma; + } + + /* If expansion failed, reset state. 
Allows us to retry merge later. */ + vmg->vma = NULL; + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + if (vmg->vma == prev) + vma_iter_set(vmg->vmi, start); + + return NULL; +} + /* * vma_expand - Expand an existing VMA * @@ -474,7 +624,11 @@ void validate_mm(struct mm_struct *mm) * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with * vmg->next needs to be handled by the caller. * - * Returns: 0 on success + * Returns: 0 on success. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. + * - The caller must have set @vmg->vma and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { @@ -484,6 +638,8 @@ int vma_expand(struct vma_merge_struct *vmg) struct vm_area_struct *next = vmg->next; struct vma_prepare vp; + mmap_assert_write_locked(vmg->mm); + vma_start_write(vma); if (next && (vma != next) && (vmg->end == next->vm_end)) { int ret; @@ -516,6 +672,7 @@ int vma_expand(struct vma_merge_struct *vmg) return 0; nomem: + vmg->state = VMA_MERGE_ERROR_NOMEM; if (anon_dup) unlink_anon_vmas(anon_dup); return -ENOMEM; @@ -1029,6 +1186,8 @@ static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg) pgoff_t pglen = PHYS_PFN(end - addr); long adj_start = 0; + vmg->state = VMA_MERGE_NOMERGE; + /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1180,13 +1339,19 @@ static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg) vma_complete(&vp, vmg->vmi, mm); validate_mm(mm); khugepaged_enter_vma(res, vmg->flags); + + vmg->state = VMA_MERGE_SUCCESS; return res; prealloc_fail: + vmg->state = VMA_MERGE_ERROR_NOMEM; if (anon_dup) unlink_anon_vmas(anon_dup); anon_vma_fail: + if (err == -ENOMEM) + vmg->state = VMA_MERGE_ERROR_NOMEM; + vma_iter_set(vmg->vmi, addr); vma_iter_load(vmg->vmi); return NULL; @@ -1293,22 +1458,6 @@ struct vm_area_struct return vma_modify(&vmg); } -/* - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller - * must ensure that [start, end) does not overlap any existing VMA. - */ -struct vm_area_struct -*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff) -{ - VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - - vmg.pgoff = pgoff; - - return vma_merge(&vmg); -} - /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. @@ -1319,8 +1468,10 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, { VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); - /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(&vmg); + vmg.next = vma_iter_next_rewind(vmi, NULL); + vmg.vma = NULL; /* We use the VMA to populate VMG fields only. 
*/ + + return vma_merge_new_range(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) @@ -1421,9 +1572,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma, *prev; + struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); + VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); /* * If anonymous vma has not yet been faulted, update new pgoff @@ -1434,11 +1586,15 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - new_vma = find_vma_prev(mm, addr, &prev); + new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); + vmg.vma = NULL; /* New VMA range. */ + vmg.pgoff = pgoff; + vmg.next = vma_iter_next_rewind(&vmi, NULL); + new_vma = vma_merge_new_range(&vmg); + if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/vma.h b/mm/vma.h index 218d884ff5ff..82354fe5edd0 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -52,6 +52,13 @@ struct vma_munmap_struct { unsigned long data_vm; }; +enum vma_merge_state { + VMA_MERGE_START, + VMA_MERGE_ERROR_NOMEM, + VMA_MERGE_NOMERGE, + VMA_MERGE_SUCCESS, +}; + /* Represents a VMA merge operation. */ struct vma_merge_struct { struct mm_struct *mm; @@ -68,8 +75,14 @@ struct vma_merge_struct { struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; + enum vma_merge_state state; }; +static inline bool vmg_nomem(struct vma_merge_struct *vmg) +{ + return vmg->state == VMA_MERGE_ERROR_NOMEM; +} + /* Assumes addr >= vma->vm_start. */ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) @@ -85,6 +98,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .end = end_, \ .flags = flags_, \ .pgoff = pgoff_, \ + .state = VMA_MERGE_START, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ @@ -103,6 +117,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ + .state = VMA_MERGE_START, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE @@ -309,10 +324,7 @@ struct vm_area_struct unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); -struct vm_area_struct -*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff); +struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, @@ -505,6 +517,34 @@ struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) return mas_prev_range(&vmi->mas, 0); } +/* + * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or + * if no previous VMA, to index 0. + */ +static inline +struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *next = vma_next(vmi); + struct vm_area_struct *prev = vma_prev(vmi); + + /* + * Consider the case where no previous VMA exists. We advance to the + * next VMA, skipping any gap, then rewind to the start of the range. 
+ * + * If we were to unconditionally advance to the next range we'd wind up + * at the next VMA again, so we check to ensure there is a previous VMA + * to skip over. + */ + if (prev) + vma_iter_next_range(vmi); + + if (pprev) + *pprev = prev; + + return next; +} + #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index f6c4706a861f..b7cdafec09af 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -101,9 +101,9 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) */ vmg->next = vma_next(vmg->vmi); vmg->prev = vma_prev(vmg->vmi); + vma_iter_next_range(vmg->vmi); - vma_iter_set(vmg->vmi, vmg->start); - return vma_merge(vmg); + return vma_merge_new_range(vmg); } /* @@ -162,10 +162,14 @@ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, merged = merge_new(vmg); if (merged) { *was_merged = true; + ASSERT_EQ(vmg->state, VMA_MERGE_SUCCESS); return merged; } *was_merged = false; + + ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE); + return alloc_and_link_vma(mm, start, end, pgoff, flags); } @@ -595,6 +599,7 @@ static bool test_vma_merge_special_flags(void) vmg.flags = flags | special_flag; vma = merge_new(&vmg); ASSERT_EQ(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); } /* 2. Modify VMA with special flag that would otherwise merge. */ @@ -616,6 +621,7 @@ static bool test_vma_merge_special_flags(void) vmg.flags = flags | special_flag; vma = merge_existing(&vmg); ASSERT_EQ(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); } cleanup_mm(&mm, &vmi); @@ -708,6 +714,7 @@ static bool test_vma_merge_with_close(void) /* The next VMA having a close() operator should cause the merge to fail.*/ ASSERT_EQ(merge_new(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); /* Now create the VMA so we can merge via modified flags */ vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); @@ -719,6 +726,7 @@ static bool test_vma_merge_with_close(void) * also fail. */ ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); /* SCENARIO B * @@ -744,6 +752,7 @@ static bool test_vma_merge_with_close(void) vmg.vma = vma; /* Make sure merge does not occur. 
*/ ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); cleanup_mm(&mm, &vmi); return true; @@ -792,6 +801,7 @@ static bool test_vma_merge_new_with_close(void) vmg_set_range(&vmg, 0x2000, 0x5000, 2, flags); vma = merge_new(&vmg); ASSERT_NE(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma->vm_start, 0); ASSERT_EQ(vma->vm_end, 0x5000); ASSERT_EQ(vma->vm_pgoff, 0); @@ -831,6 +841,7 @@ static bool test_merge_existing(void) vmg.prev = vma; vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x3000); ASSERT_EQ(vma_next->vm_end, 0x9000); ASSERT_EQ(vma_next->vm_pgoff, 3); @@ -861,6 +872,7 @@ static bool test_merge_existing(void) vmg.vma = vma; vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x2000); ASSERT_EQ(vma_next->vm_end, 0x9000); ASSERT_EQ(vma_next->vm_pgoff, 2); @@ -889,6 +901,7 @@ static bool test_merge_existing(void) vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x6000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -920,6 +933,7 @@ static bool test_merge_existing(void) vmg.vma = vma; vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x7000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -948,6 +962,7 @@ static bool test_merge_existing(void) vmg.vma = vma; vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x9000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -981,31 +996,37 @@ static bool test_merge_existing(void) vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x6000, 0x7000, 6, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x4000, 0x7000, 4, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x4000, 0x6000, 4, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); @@ -1071,6 +1092,7 @@ static bool test_anon_vma_non_mergeable(void) vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x7000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -1106,6 +1128,7 @@ static bool test_anon_vma_non_mergeable(void) vmg.prev = vma_prev; ASSERT_EQ(merge_new(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x7000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -1181,6 +1204,7 @@ static bool test_dup_anon_vma(void) vmg.vma = vma; 
ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x8000); @@ -1209,6 +1233,7 @@ static bool test_dup_anon_vma(void) vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x8000); @@ -1236,6 +1261,7 @@ static bool test_dup_anon_vma(void) vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x5000); @@ -1263,6 +1289,7 @@ static bool test_dup_anon_vma(void) vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x3000); ASSERT_EQ(vma_next->vm_end, 0x8000); @@ -1303,6 +1330,7 @@ static bool test_vmi_prealloc_fail(void) /* This will cause the merge to fail. */ ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); /* We will already have assigned the anon_vma. */ ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); /* And it was both cloned and unlinked. */ @@ -1327,6 +1355,7 @@ static bool test_vmi_prealloc_fail(void) fail_prealloc = true; ASSERT_EQ(expand_existing(&vmg), -ENOMEM); + ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(dummy_anon_vma.was_cloned); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index a3c262c6eb73..c5b9da034511 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -740,6 +740,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi) mas_destroy(&vmi->mas); } +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + static inline void vm_acct_memory(long pages) { } -- cgit v1.2.3 From cc8cb3697a8d8eabe1fb9acb8768b11c1ab607d8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:21 +0100 Subject: mm: refactor vma_merge() into modify-only vma_merge_existing_range() The existing vma_merge() function is no longer required to handle what were previously referred to as cases 1-3 (i.e. the merging of a new VMA), as this is now handled by vma_merge_new_range(). Additionally, simplify the convoluted control flow of the original, maintaining identical logic, only expressed more clearly, and doing away with a complicated set of cases, instead logically examining each possible outcome - merging of both the previous and subsequent VMA, merging of the previous VMA alone, and merging of the subsequent VMA alone. We now utilise the previously implemented commit_merge() function to share logic with vma_expand(), de-duplicating code and providing less surface area for bugs and confusion. In order to do so, we adjust this function to accept parameters specific to merging existing ranges. Link: https://lkml.kernel.org/r/2cf6016b7bfcc4965fc3cde10827560c42e4f12c.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. 
McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vma.c | 508 ++++++++++++++++++++++++------------------------ tools/testing/vma/vma.c | 9 +- 2 files changed, 264 insertions(+), 253 deletions(-) (limited to 'tools') diff --git a/mm/vma.c b/mm/vma.c index 566cad2338dd..393bef832604 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -587,29 +587,278 @@ void validate_mm(struct mm_struct *mm) /* Actually perform the VMA merge operation. */ static int commit_merge(struct vma_merge_struct *vmg, - struct vm_area_struct *remove) + struct vm_area_struct *adjust, + struct vm_area_struct *remove, + struct vm_area_struct *remove2, + long adj_start, + bool expanded) { struct vma_prepare vp; - init_multi_vma_prep(&vp, vmg->vma, NULL, remove, NULL); + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2); - /* Note: vma iterator must be pointing to 'start'. */ - vma_iter_config(vmg->vmi, vmg->start, vmg->end); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + if (expanded) { + /* Note: vma iterator must be pointing to 'start'. */ + vma_iter_config(vmg->vmi, vmg->start, vmg->end); + } else { + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, + adjust->vm_end); + } if (vma_iter_prealloc(vmg->vmi, vmg->vma)) return -ENOMEM; vma_prepare(&vp); - vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, 0); + vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start); vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff); - vma_iter_store(vmg->vmi, vmg->vma); + if (expanded) + vma_iter_store(vmg->vmi, vmg->vma); + + if (adj_start) { + adjust->vm_start += adj_start; + adjust->vm_pgoff += PHYS_PFN(adj_start); + if (adj_start < 0) { + WARN_ON(expanded); + vma_iter_store(vmg->vmi, adjust); + } + } vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm); return 0; } +/* + * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its + * attributes modified. + * + * @vmg: Describes the modifications being made to a VMA and associated + * metadata. + * + * When the attributes of a range within a VMA change, then it might be possible + * for immediately adjacent VMAs to be merged into that VMA due to having + * identical properties. + * + * This function checks for the existence of any such mergeable VMAs and updates + * the maple tree describing the @vmg->vma->vm_mm address space to account for + * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge. + * + * As part of this operation, if a merge occurs, the @vmg object will have its + * vma, start, end, and pgoff fields modified to execute the merge. Subsequent + * calls to this function should reset these fields. + * + * Returns: The merged VMA if merge succeeds, or NULL otherwise. + * + * ASSUMPTIONS: + * - The caller must assign the VMA to be modifed to @vmg->vma. + * - The caller must have set @vmg->prev to the previous VMA, if there is one. + * - The caller must not set @vmg->next, as we determine this. + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). 
+ */ +static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next, *res; + struct vm_area_struct *anon_dup = NULL; + struct vm_area_struct *adjust = NULL; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + bool left_side = vma && start == vma->vm_start; + bool right_side = vma && end == vma->vm_end; + int err = 0; + long adj_start = 0; + bool merge_will_delete_vma, merge_will_delete_next; + bool merge_left, merge_right, merge_both; + bool expanded; + + mmap_assert_write_locked(vmg->mm); + VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */ + VM_WARN_ON(vmg->next); /* We set this. */ + VM_WARN_ON(prev && start <= prev->vm_start); + VM_WARN_ON(start >= end); + /* + * If vma == prev, then we are offset into a VMA. Otherwise, if we are + * not, we must span a portion of the VMA. + */ + VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) || + vmg->end > vma->vm_end)); + /* The vmi must be positioned within vmg->vma. */ + VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && + vma_iter_addr(vmg->vmi) < vma->vm_end)); + + vmg->state = VMA_MERGE_NOMERGE; + + /* + * If a special mapping or if the range being modified is neither at the + * furthermost left or right side of the VMA, then we have no chance of + * merging and should abort. + */ + if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) + return NULL; + + if (left_side) + merge_left = can_vma_merge_left(vmg); + else + merge_left = false; + + if (right_side) { + next = vmg->next = vma_iter_next_range(vmg->vmi); + vma_iter_prev_range(vmg->vmi); + + merge_right = can_vma_merge_right(vmg, merge_left); + } else { + merge_right = false; + next = NULL; + } + + if (merge_left) /* If merging prev, position iterator there. */ + vma_prev(vmg->vmi); + else if (!merge_right) /* If we have nothing to merge, abort. */ + return NULL; + + merge_both = merge_left && merge_right; + /* If we span the entire VMA, a merge implies it will be deleted. */ + merge_will_delete_vma = left_side && right_side; + /* + * If we merge both VMAs, then next is also deleted. This implies + * merge_will_delete_vma also. + */ + merge_will_delete_next = merge_both; + + /* No matter what happens, we will be adjusting vma. */ + vma_start_write(vma); + + if (merge_left) + vma_start_write(prev); + + if (merge_right) + vma_start_write(next); + + if (merge_both) { + /* + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vmg->vma = prev; + vmg->start = prev->vm_start; + vmg->end = next->vm_end; + vmg->pgoff = prev->vm_pgoff; + + /* + * We already ensured anon_vma compatibility above, so now it's + * simply a case of, if prev has no anon_vma object, which of + * next or vma contains the anon_vma we must duplicate. + */ + err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup); + } else if (merge_left) { + /* + * |<----->| OR + * |<--------->| + * |-------************* + * prev vma + * extend shrink/delete + */ + + vmg->vma = prev; + vmg->start = prev->vm_start; + vmg->pgoff = prev->vm_pgoff; + + if (merge_will_delete_vma) { + /* + * can_vma_merge_after() assumed we would not be + * removing vma, so it skipped the check for + * vm_ops->close, but we are removing vma. 
+ */ + if (vma->vm_ops && vma->vm_ops->close) + err = -EINVAL; + } else { + adjust = vma; + adj_start = vmg->end - vma->vm_start; + } + + if (!err) + err = dup_anon_vma(prev, vma, &anon_dup); + } else { /* merge_right */ + /* + * |<----->| OR + * |<--------->| + * *************-------| + * vma next + * shrink/delete extend + */ + + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); + + VM_WARN_ON(!merge_right); + /* If we are offset into a VMA, then prev must be vma. */ + VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev); + + if (merge_will_delete_vma) { + vmg->vma = next; + vmg->end = next->vm_end; + vmg->pgoff = next->vm_pgoff - pglen; + } else { + /* + * We shrink vma and expand next. + * + * IMPORTANT: This is the ONLY case where the final + * merged VMA is NOT vmg->vma, but rather vmg->next. + */ + + vmg->start = vma->vm_start; + vmg->end = start; + vmg->pgoff = vma->vm_pgoff; + + adjust = next; + adj_start = -(vma->vm_end - start); + } + + err = dup_anon_vma(next, vma, &anon_dup); + } + + if (err) + goto abort; + + /* + * In nearly all cases, we expand vmg->vma. There is one exception - + * merge_right where we partially span the VMA. In this case we shrink + * the end of vmg->vma and adjust the start of vmg->next accordingly. + */ + expanded = !merge_right || merge_will_delete_vma; + + if (commit_merge(vmg, adjust, + merge_will_delete_vma ? vma : NULL, + merge_will_delete_next ? next : NULL, + adj_start, expanded)) { + if (anon_dup) + unlink_anon_vmas(anon_dup); + + vmg->state = VMA_MERGE_ERROR_NOMEM; + return NULL; + } + + res = merge_left ? prev : next; + khugepaged_enter_vma(res, vmg->flags); + + vmg->state = VMA_MERGE_SUCCESS; + return res; + +abort: + vma_iter_set(vmg->vmi, start); + vma_iter_load(vmg->vmi); + vmg->state = VMA_MERGE_ERROR_NOMEM; + return NULL; +} + /* * vma_merge_new_range - Attempt to merge a new VMA into address space * @@ -757,7 +1006,7 @@ int vma_expand(struct vma_merge_struct *vmg) /* Only handles expanding */ VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); - if (commit_merge(vmg, remove_next ? next : NULL)) + if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) goto nomem; return 0; @@ -1127,249 +1376,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } -/* - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), - * figure out whether that can be merged with its predecessor or its - * successor. Or both (it neatly fills a hole). - * - * In most cases - when called for mmap, brk or mremap - [addr,end) is - * certain not to be mapped by the time vma_merge is called; but when - * called for mprotect, it is certain to be already mapped (either at - * an offset within prev, or at the start of next), and the flags of - * this area are about to be changed to vm_flags - and the no-change - * case has already been eliminated. 
- * - * The following mprotect cases have to be considered, where **** is - * the area passed down from mprotect_fixup, never extending beyond one - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts - * at the same address as **** and is of the same or larger span, and - * NNNN the next vma after ****: - * - * **** **** **** - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC - * cannot merge might become might become - * PPNNNNNNNNNN PPPPPPPPPPCC - * mmap, brk or case 4 below case 5 below - * mremap move: - * **** **** - * PPPP NNNN PPPPCCCCNNNN - * might become might become - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 - * - * It is important for case 8 that the vma CCCC overlapping the - * region **** is never going to extended over NNNN. Instead NNNN must - * be extended in region **** and CCCC must be removed. This way in - * all cases where vma_merge succeeds, the moment vma_merge drops the - * rmap_locks, the properties of the merged vma will be already - * correct for the whole merged range. Some of those properties like - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must - * be correct for the whole merged range immediately after the - * rmap_locks are released. Otherwise if NNNN would be removed and - * CCCC would be extended over the NNNN range, remove_migration_ptes - * or other rmap walkers (if working on addresses beyond the "end" - * parameter) may establish ptes with the wrong permissions of CCCC - * instead of the right permissions of NNNN. - * - * In the code below: - * PPPP is represented by *prev - * CCCC is represented by *curr or not represented at all (NULL) - * NNNN is represented by *next or not represented at all (NULL) - * **** is not represented - it will be merged and the vma containing the - * area is returned, or the function will return NULL - */ -static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg) -{ - struct mm_struct *mm = vmg->mm; - struct vm_area_struct *prev = vmg->prev; - struct vm_area_struct *curr, *next, *res; - struct vm_area_struct *vma, *adjust, *remove, *remove2; - struct vm_area_struct *anon_dup = NULL; - struct vma_prepare vp; - pgoff_t vma_pgoff; - int err = 0; - bool merge_prev = false; - bool merge_next = false; - bool vma_expanded = false; - unsigned long addr = vmg->start; - unsigned long end = vmg->end; - unsigned long vma_start = addr; - unsigned long vma_end = end; - pgoff_t pglen = PHYS_PFN(end - addr); - long adj_start = 0; - - vmg->state = VMA_MERGE_NOMERGE; - - /* - * We later require that vma->vm_flags == vm_flags, - * so this tests vma->vm_flags & VM_SPECIAL, too. - */ - if (vmg->flags & VM_SPECIAL) - return NULL; - - /* Does the input range span an existing VMA? (cases 5 - 8) */ - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); - - if (!curr || /* cases 1 - 4 */ - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ - next = vmg->next = vma_lookup(mm, end); - else - next = vmg->next = NULL; /* case 5 */ - - if (prev) { - vma_start = prev->vm_start; - vma_pgoff = prev->vm_pgoff; - - /* Can we merge the predecessor? */ - if (addr == prev->vm_end && can_vma_merge_after(vmg)) { - merge_prev = true; - vma_prev(vmg->vmi); - } - } - - /* Can we merge the successor? */ - if (next && can_vma_merge_before(vmg)) { - merge_next = true; - } - - /* Verify some invariant that must be enforced by the caller. 
*/ - VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); - VM_WARN_ON(addr >= end); - - if (!merge_prev && !merge_next) - return NULL; /* Not mergeable. */ - - if (merge_prev) - vma_start_write(prev); - - res = vma = prev; - remove = remove2 = adjust = NULL; - - /* Can we merge both the predecessor and the successor? */ - if (merge_prev && merge_next && - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { - vma_start_write(next); - remove = next; /* case 1 */ - vma_end = next->vm_end; - err = dup_anon_vma(prev, next, &anon_dup); - if (curr) { /* case 6 */ - vma_start_write(curr); - remove = curr; - remove2 = next; - /* - * Note that the dup_anon_vma below cannot overwrite err - * since the first caller would do nothing unless next - * has an anon_vma. - */ - if (!next->anon_vma) - err = dup_anon_vma(prev, curr, &anon_dup); - } - } else if (merge_prev) { /* case 2 */ - if (curr) { - vma_start_write(curr); - if (end == curr->vm_end) { /* case 7 */ - /* - * can_vma_merge_after() assumed we would not be - * removing prev vma, so it skipped the check - * for vm_ops->close, but we are removing curr - */ - if (curr->vm_ops && curr->vm_ops->close) - err = -EINVAL; - remove = curr; - } else { /* case 5 */ - adjust = curr; - adj_start = (end - curr->vm_start); - } - if (!err) - err = dup_anon_vma(prev, curr, &anon_dup); - } - } else { /* merge_next */ - vma_start_write(next); - res = next; - if (prev && addr < prev->vm_end) { /* case 4 */ - vma_start_write(prev); - vma_end = addr; - adjust = next; - adj_start = -(prev->vm_end - addr); - err = dup_anon_vma(next, prev, &anon_dup); - } else { - /* - * Note that cases 3 and 8 are the ONLY ones where prev - * is permitted to be (but is not necessarily) NULL. - */ - vma = next; /* case 3 */ - vma_start = addr; - vma_end = next->vm_end; - vma_pgoff = next->vm_pgoff - pglen; - if (curr) { /* case 8 */ - vma_pgoff = curr->vm_pgoff; - vma_start_write(curr); - remove = curr; - err = dup_anon_vma(next, curr, &anon_dup); - } - } - } - - /* Error in anon_vma clone. */ - if (err) - goto anon_vma_fail; - - if (vma_start < vma->vm_start || vma_end > vma->vm_end) - vma_expanded = true; - - if (vma_expanded) { - vma_iter_config(vmg->vmi, vma_start, vma_end); - } else { - vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, - adjust->vm_end); - } - - if (vma_iter_prealloc(vmg->vmi, vma)) - goto prealloc_fail; - - init_multi_vma_prep(&vp, vma, adjust, remove, remove2); - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && - vp.anon_vma != adjust->anon_vma); - - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - vma_set_range(vma, vma_start, vma_end, vma_pgoff); - - if (vma_expanded) - vma_iter_store(vmg->vmi, vma); - - if (adj_start) { - adjust->vm_start += adj_start; - adjust->vm_pgoff += adj_start >> PAGE_SHIFT; - if (adj_start < 0) { - WARN_ON(vma_expanded); - vma_iter_store(vmg->vmi, next); - } - } - - vma_complete(&vp, vmg->vmi, mm); - validate_mm(mm); - khugepaged_enter_vma(res, vmg->flags); - - vmg->state = VMA_MERGE_SUCCESS; - return res; - -prealloc_fail: - vmg->state = VMA_MERGE_ERROR_NOMEM; - if (anon_dup) - unlink_anon_vmas(anon_dup); - -anon_vma_fail: - if (err == -ENOMEM) - vmg->state = VMA_MERGE_ERROR_NOMEM; - - vma_iter_set(vmg->vmi, addr); - vma_iter_load(vmg->vmi); - return NULL; -} - /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). 
@@ -1389,7 +1395,7 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *merged; /* First, try to merge. */ - merged = vma_merge(vmg); + merged = vma_merge_existing_range(vmg); if (merged) return merged; diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index b7cdafec09af..25a95d9901ea 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -112,7 +112,7 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) */ static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) { - return vma_merge(vmg); + return vma_merge_existing_range(vmg); } /* @@ -752,7 +752,12 @@ static bool test_vma_merge_with_close(void) vmg.vma = vma; /* Make sure merge does not occur. */ ASSERT_EQ(merge_existing(&vmg), NULL); - ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + /* + * Initially this is misapprehended as an out of memory report, as the + * close() check is handled in the same way as anon_vma duplication + * failures, however a subsequent patch resolves this. + */ + ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); cleanup_mm(&mm, &vmi); return true; -- cgit v1.2.3 From 01c373e9a5ce2273812eaf83036c5357829fb3f7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:22 +0100 Subject: mm: rework vm_ops->close() handling on VMA merge In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test") we relaxed the VMA merge rules for VMAs possessing a vm_ops->close() hook, permitting this operation in instances where we wouldn't delete the VMA as part of the merge operation. This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix vma_merge() case 7 with vma_ops->close") to account for a subtle case that the previous commit had not taken into account. In both instances, we first rely on is_mergeable_vma() to determine whether we might be dealing with a VMA that could be removed, taking advantage of the fact that a 'previous' VMA will never be deleted, only VMAs that follow it. The second patch corrects the instance where a merge of the previous VMA into a subsequent one did not correctly check whether the subsequent VMA had a vm_ops->close() handler. Both changes prevent merge cases that are actually permissible (for instance a merge of a VMA into a following VMA with a vm_ops->close(), but with no previous VMA, which would result in the next VMA being extended, not deleted). In addition, both changes fail to consider the case where a VMA that would otherwise be merged with the previous and next VMA might have vm_ops->close(), on the assumption that for this to be the case, all three would have to have the same vma->vm_file to be mergeable and thus the same vm_ops. Additionally, both changes operate at 50,000 feet, trying to guess whether a VMA will be deleted. As we have majorly refactored the VMA merge operation and de-duplicated code to the point where we know precisely where deletions will occur, this patch removes the aforementioned checks altogether and instead explicitly checks whether a VMA will be deleted. In cases where a reduced merge is still possible (where we merge both previous and next VMA but the next VMA has a vm_ops->close hook, meaning we could just merge the previous and current VMA), we do so; otherwise the merge is not permitted. We take advantage of our userland testing to assert that this functions correctly - replacing the previous limited vm_ops->close() tests with tests for every single case where we delete a VMA. 
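In outline, the new policy amounts to the following (a condensed sketch of the hunks below, not the verbatim diff):

	/* We can only remove VMAs when merging if they do not have a close hook. */
	static bool can_merge_remove_vma(struct vm_area_struct *vma)
	{
		return !vma->vm_ops || !vma->vm_ops->close;
	}

	/* Then, within vma_merge_existing_range(), roughly: */
	if (merge_will_delete_vma && !can_merge_remove_vma(vma))
		return NULL;	/* Removing vma outright is impossible - abort. */
	if (merge_will_delete_next && !can_merge_remove_vma(next)) {
		/* Reduce the operation to merging prev and vma only. */
		merge_will_delete_next = false;
		merge_right = false;
		merge_both = false;
	}
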
We also update all testing for both new and modified VMAs to set vma->vm_ops->close() in every single instance where this would not prevent the merge, to assert that we never do so. Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Liam R. Howlett Cc: Mark Brown Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vma.c | 57 +++++++++++------ tools/testing/vma/vma.c | 166 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 164 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/mm/vma.c b/mm/vma.c index 393bef832604..8d1686fc8d5a 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -10,14 +10,6 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; - /* - * If the vma has a ->close operation then the driver probably needs to - * release per-vma resources, so we don't attempt to merge those if the - * caller indicates the current vma may be removed as part of the merge, - * which is the case if we are attempting to merge the next VMA into - * this one. - */ - bool may_remove_vma = merge_next; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; @@ -33,8 +25,6 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex return false; if (vma->vm_file != vmg->file) return false; - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) - return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) @@ -632,6 +622,12 @@ static int commit_merge(struct vma_merge_struct *vmg, return 0; } +/* We can only remove VMAs when merging if they do not have a close hook. */ +static bool can_merge_remove_vma(struct vm_area_struct *vma) +{ + return !vma->vm_ops || !vma->vm_ops->close; +} + /* * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its * attributes modified. @@ -725,12 +721,30 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ merge_will_delete_vma = left_side && right_side; + + /* + * If we need to remove vma in its entirety but are unable to do so, + * we have no sensible recourse but to abort the merge. + */ + if (merge_will_delete_vma && !can_merge_remove_vma(vma)) + return NULL; + /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ merge_will_delete_next = merge_both; + /* + * If we cannot delete next, then we can reduce the operation to merging + * prev and vma (thereby deleting vma). + */ + if (merge_will_delete_next && !can_merge_remove_vma(next)) { + merge_will_delete_next = false; + merge_right = false; + merge_both = false; + } + /* No matter what happens, we will be adjusting vma. */ vma_start_write(vma); @@ -772,21 +786,12 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; - if (merge_will_delete_vma) { - /* - * can_vma_merge_after() assumed we would not be - * removing vma, so it skipped the check for - * vm_ops->close, but we are removing vma. 
- */ - if (vma->vm_ops && vma->vm_ops->close) - err = -EINVAL; - } else { + if (!merge_will_delete_vma) { adjust = vma; adj_start = vmg->end - vma->vm_start; } - if (!err) - err = dup_anon_vma(prev, vma, &anon_dup); + err = dup_anon_vma(prev, vma, &anon_dup); } else { /* merge_right */ /* * |<----->| OR @@ -940,6 +945,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) vmg->vma = prev; vmg->pgoff = prev->vm_pgoff; + /* + * If this merge would result in removal of the next VMA but we + * are not permitted to do so, reduce the operation to merging + * prev and vma. + */ + if (can_merge_right && !can_merge_remove_vma(next)) + vmg->end = end; + vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ } @@ -994,6 +1007,8 @@ int vma_expand(struct vma_merge_struct *vmg) int ret; remove_next = true; + /* This should already have been checked by this point. */ + VM_WARN_ON(!can_merge_remove_vma(next)); vma_start_write(next); ret = dup_anon_vma(vma, next, &anon_dup); if (ret) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 25a95d9901ea..c53f220eb6cc 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -387,6 +387,9 @@ static bool test_merge_new(void) struct anon_vma_chain dummy_anon_vma_chain_d = { .anon_vma = &dummy_anon_vma, }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; int count; struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; bool merged; @@ -430,6 +433,7 @@ static bool test_merge_new(void) * 0123456789abc * AA*B DD CC */ + vma_a->vm_ops = &vm_ops; /* This should have no impact. */ vma_b->anon_vma = &dummy_anon_vma; vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged); ASSERT_EQ(vma, vma_a); @@ -466,6 +470,7 @@ static bool test_merge_new(void) * AAAAA *DD CC */ vma_d->anon_vma = &dummy_anon_vma; + vma_d->vm_ops = &vm_ops; /* This should have no impact. */ vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged); ASSERT_EQ(vma, vma_d); /* Prepend. */ @@ -483,6 +488,7 @@ static bool test_merge_new(void) * 0123456789abc * AAAAA*DDD CC */ + vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */ vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged); ASSERT_EQ(vma, vma_a); /* Merge with A, delete D. */ @@ -640,13 +646,11 @@ static bool test_vma_merge_with_close(void) const struct vm_operations_struct vm_ops = { .close = dummy_close, }; - struct vm_area_struct *vma_next = - alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags); - struct vm_area_struct *vma; + struct vm_area_struct *vma_prev, *vma_next, *vma; /* - * When we merge VMAs we sometimes have to delete others as part of the - * operation. + * When merging VMAs we are not permitted to remove any VMA that has a + * vm_ops->close() hook. * * Considering the two possible adjacent VMAs to which a VMA can be * merged: @@ -697,28 +701,52 @@ static bool test_vma_merge_with_close(void) * would be set too, and thus scenario A would pick this up. */ - ASSERT_NE(vma_next, NULL); - /* - * SCENARIO A + * The only case of a new VMA merge that results in a VMA being deleted + * is one where both the previous and next VMAs are merged - in this + * instance the next VMA is deleted, and the previous VMA is extended. * - * 0123 - * *N + * If we are unable to do so, we reduce the operation to simply + * extending the prev VMA and not merging next. + * + * 0123456789 + * PPP**NNNN + * -> + * 0123456789 + * PPPPPPNNN */ - /* Make the next VMA have a close() callback. 
*/ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); vma_next->vm_ops = &vm_ops; - /* Our proposed VMA has characteristics that would otherwise be merged. */ - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + ASSERT_EQ(merge_new(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); - /* The next VMA having a close() operator should cause the merge to fail.*/ - ASSERT_EQ(merge_new(&vmg), NULL); - ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); - /* Now create the VMA so we can merge via modified flags */ - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); - vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags); + /* + * When modifying an existing VMA there are further cases where we + * delete VMAs. + * + * <> + * 0123456789 + * PPPVV + * + * In this instance, if vma has a close hook, the merge simply cannot + * proceed. + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; vmg.vma = vma; /* @@ -728,38 +756,90 @@ static bool test_vma_merge_with_close(void) ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - /* SCENARIO B + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * This case is mirrored if merging with next. * - * 0123 - * P* + * <> + * 0123456789 + * VVNNNN * - * In order for this scenario to trigger, the VMA currently being - * modified must also have a .close(). + * In this instance, if vma has a close hook, the merge simply cannot + * proceed. */ - /* Reset VMG state. */ - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); - /* - * Make next unmergeable, and don't let the scenario A check pick this - * up, we want to reproduce scenario B only. - */ - vma_next->vm_ops = NULL; - vma_next->__vm_flags &= ~VM_MAYWRITE; - /* Allocate prev. */ - vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); - /* Assign a vm_ops->close() function to VMA explicitly. */ + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); vmg.vma = vma; - /* Make sure merge does not occur. */ ASSERT_EQ(merge_existing(&vmg), NULL); /* * Initially this is misapprehended as an out of memory report, as the * close() check is handled in the same way as anon_vma duplication * failures, however a subsequent patch resolves this. */ - ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Finally, we consider two variants of the case where we modify a VMA + * to merge with both the previous and next VMAs. + * + * The first variant is where vma has a close hook. In this instance, no + * merge can proceed. 
+ * + * <> + * 0123456789 + * PPPVVNNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); + + /* + * The second variant is where next has a close hook. In this instance, + * we reduce the operation to a merge between prev and vma. + * + * <> + * 0123456789 + * PPPVVNNNN + * -> + * 0123456789 + * PPPPPNNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma_next->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); - cleanup_mm(&mm, &vmi); return true; } @@ -828,6 +908,9 @@ static bool test_merge_existing(void) .mm = &mm, .vmi = &vmi, }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; /* * Merge right case - partial span. @@ -840,7 +923,9 @@ static bool test_merge_existing(void) * VNNNNNN */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma->vm_ops = &vm_ops; /* This should have no impact. */ vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); vmg.vma = vma; vmg.prev = vma; @@ -873,6 +958,7 @@ static bool test_merge_existing(void) */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags); vmg.vma = vma; vma->anon_vma = &dummy_anon_vma; @@ -899,7 +985,9 @@ static bool test_merge_existing(void) * PPPPPPV */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); vmg.prev = vma_prev; vmg.vma = vma; @@ -932,6 +1020,7 @@ static bool test_merge_existing(void) * PPPPPPP */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); vmg.prev = vma_prev; @@ -960,6 +1049,7 @@ static bool test_merge_existing(void) * PPPPPPPPPP */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); -- cgit v1.2.3 From 9cb75552f421c5e9667bc19fb430063cb023c219 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 26 Aug 2024 20:03:28 -0700 Subject: selftests/damon: add access_memory_even to .gitignore Patch series "misc fixups for DAMON {self,kunit} tests". 
This patchset is for minor fixups of DAMON selftests and kunit tests. The first three patches make DAMON selftests more cleanly maintained (patches 1 and 2) without unnecessary warnings (patch 3). The following six patches remove an unnecessary test case (patch 4), handle config combinations that can make tests fail (patches 5-7), reorganize the test files following the new guideline (patch 8), and add a reference kunitconfig for DAMON kunit tests (patch 9). This patch (of 9): DAMON selftests build access_memory_even, but it's not on the .gitignore list. Add it to make 'git status' output cleaner. Link: https://lkml.kernel.org/r/20240827030336.7930-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240827030336.7930-2-sj@kernel.org Fixes: c94df805c774 ("selftests/damon: implement a program for even-numbered memory regions access") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index e65ef9d9cedc..2ab675fecb6b 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -3,3 +3,4 @@ huge_count_read_write debugfs_target_ids_read_before_terminate_race debugfs_target_ids_pid_leak access_memory +access_memory_even -- cgit v1.2.3 From 582c04b07fa91665b4f38682c290fb9f0b085c7e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 26 Aug 2024 20:03:29 -0700 Subject: selftests/damon: cleanup __pycache__/ with 'make clean' Python-based tests create a __pycache__/ directory. Remove it with 'make clean' by defining it as EXTRA_CLEAN. Link: https://lkml.kernel.org/r/20240827030336.7930-3-sj@kernel.org Fixes: b5906f5f7359 ("selftests/damon: add a test for update_schemes_tried_regions sysfs command") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 1e2e98cc809d..5b2a6a5dd1af 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -25,4 +25,6 @@ TEST_PROGS += debugfs_target_ids_pid_leak.sh TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py +EXTRA_CLEAN = __pycache__ + include ../lib.mk -- cgit v1.2.3 From 8c211412c5dffd090eaea5ee033cd729f8e5f873 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 26 Aug 2024 20:03:30 -0700 Subject: selftests/damon: add execute permissions to test scripts Some test scripts are missing executable permissions. This causes warnings that make the test output unnecessarily verbose. Add executable permissions.
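As an illustration of what the fix addresses, a check for a missing execute bit is easy to do programmatically; the following is a hypothetical stand-alone C helper (not part of the selftests, the name and usage are made up) that flags such files:

#include <stdio.h>
#include <unistd.h>

/* Print every path argument that lacks execute permission for the caller. */
int main(int argc, char **argv)
{
	int i;

	for (i = 1; i < argc; i++) {
		if (access(argv[i], X_OK))
			printf("%s is not executable\n", argv[i]);
	}
	return 0;
}

Compiled as, say, check_exec and run against the .py and .sh files in the damon test directory, it would list the scripts that still need their mode changed.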
Link: https://lkml.kernel.org/r/20240827030336.7930-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/damon_nr_regions.py | 0 tools/testing/selftests/damon/damos_apply_interval.py | 0 tools/testing/selftests/damon/damos_quota.py | 0 tools/testing/selftests/damon/damos_quota_goal.py | 0 tools/testing/selftests/damon/damos_tried_regions.py | 0 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh | 0 .../selftests/damon/debugfs_target_ids_read_before_terminate_race.sh | 0 tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py | 0 .../selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py | 0 9 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/selftests/damon/damon_nr_regions.py mode change 100644 => 100755 tools/testing/selftests/damon/damos_apply_interval.py mode change 100644 => 100755 tools/testing/selftests/damon/damos_quota.py mode change 100644 => 100755 tools/testing/selftests/damon/damos_quota_goal.py mode change 100644 => 100755 tools/testing/selftests/damon/damos_tried_regions.py mode change 100644 => 100755 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh mode change 100644 => 100755 tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh mode change 100644 => 100755 tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py mode change 100644 => 100755 tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py (limited to 'tools') diff --git a/tools/testing/selftests/damon/damon_nr_regions.py b/tools/testing/selftests/damon/damon_nr_regions.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_apply_interval.py b/tools/testing/selftests/damon/damos_apply_interval.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_quota.py b/tools/testing/selftests/damon/damos_quota.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_tried_regions.py b/tools/testing/selftests/damon/damos_tried_regions.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py old mode 100644 new mode 100755 -- cgit v1.2.3 From 7c65ae81ea86a6ed8086e1a5651acd766187f19b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Sep 2024 21:54:28 -1000 Subject: sched_ext: Don't call put_prev_task_scx() before picking the next task fd03c5b85855 ("sched: Rework pick_next_task()") changed the definition of pick_next_task() from: pick_next_task() := pick_task() + set_next_task(.first = true) to: pick_next_task(prev) := pick_task() + 
put_prev_task() + set_next_task(.first = true) making it pick_next_task()'s responsibility to invoke put_prev_task(). This reordering allows pick_task() to be shared between regular and core-sched paths and put_prev_task() to know the next task. sched_ext depended on put_prev_task_scx() enqueueing the current task before pick_next_task_scx() is called. While pulling sched/core changes, 70cc76aa0d80 ("Merge branch 'tip/sched/core' into for-6.12") added an explicit put_prev_task_scx() call for SCX tasks in pick_next_task_scx() before picking the first task as a workaround. Clean it up and adopt the conventions that other sched classes are following. The logic for keeping the current task running was spread out and required the task to be put on the local DSQ before picking: - balance_one() used SCX_TASK_BAL_KEEP to indicate that the task is still runnable, hasn't exhausted its slice, and thus should keep running. - put_prev_task_scx() enqueued the task to the local DSQ if SCX_TASK_BAL_KEEP is set. It also called do_enqueue_task() with SCX_ENQ_LAST if it is the only runnable task. do_enqueue_task() in turn decided whether to use the local DSQ depending on SCX_OPS_ENQ_LAST. Consolidate the logic in balance_one() as it always knows whether it is going to keep the current task. balance_one() now considers all conditions where the current task should be kept and uses SCX_TASK_BAL_KEEP to tell pick_next_task_scx() to keep the current task instead of picking one from the local DSQ. Accordingly, SCX_ENQ_LAST handling is removed from put_prev_task_scx() and do_enqueue_task() and pick_next_task_scx() is updated to pick the current task if SCX_TASK_BAL_KEEP is set. The workaround put_prev_task[_scx]() calls are replaced with put_prev_set_next_task(). This causes two behavior changes observable from the BPF scheduler: - When a task keeps running, it no longer goes through an enqueue/dequeue cycle and thus ops.stopping/running() transitions. The new behavior is better and all the existing schedulers should be able to handle it. - The BPF scheduler cannot keep executing the current task by enqueueing an SCX_ENQ_LAST task to the local DSQ. If SCX_OPS_ENQ_LAST is specified, the BPF scheduler is responsible for resuming execution after each SCX_ENQ_LAST. SCX_OPS_ENQ_LAST is mostly useful for cases where scheduling decisions are not made on the local CPU - e.g. central or userspace-driven scheduling - and the new behavior is more logical and shouldn't pose any problems. The SCX_OPS_ENQ_LAST demonstration is dropped from scx_qmap as it doesn't fit that well anymore and the last task handling is moved to the end of qmap_dispatch(). Signed-off-by: Tejun Heo Cc: David Vernet Cc: Andrea Righi Cc: Changwoo Min Cc: Daniel Hodges Cc: Dan Schatzberg --- kernel/sched/ext.c | 130 ++++++++++++++++++++--------------------- tools/sched_ext/scx_qmap.bpf.c | 22 +++++-- 2 files changed, 80 insertions(+), 72 deletions(-) (limited to 'tools') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5e862c7c0e6b..be86dbfa75a8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -630,11 +630,8 @@ enum scx_enq_flags { * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the * %SCX_ENQ_LAST flag set. * - * If the BPF scheduler wants to continue executing the task, - * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. - * If the task gets queued on a different dsq or the BPF side, the BPF - * scheduler is responsible for triggering a follow-up scheduling event. - * Otherwise, Execution may stall. 
+ * The BPF scheduler is responsible for triggering a follow-up + * scheduling event. Otherwise, Execution may stall. */ SCX_ENQ_LAST = 1LLU << 41, @@ -1852,12 +1849,8 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_ops_bypassing()) { - if (enq_flags & SCX_ENQ_LAST) - goto local; - else - goto global; - } + if (scx_ops_bypassing()) + goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; @@ -1867,11 +1860,6 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, unlikely(p->flags & PF_EXITING)) goto local; - /* see %SCX_OPS_ENQ_LAST */ - if (!static_branch_unlikely(&scx_ops_enq_last) && - (enq_flags & SCX_ENQ_LAST)) - goto local; - if (!SCX_HAS_OP(enqueue)) goto global; @@ -2517,7 +2505,6 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local) struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; int nr_loops = SCX_DSP_MAX_LOOPS; - bool has_tasks = false; lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; @@ -2542,9 +2529,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local) /* * If @prev is runnable & has slice left, it has priority and * fetching more just increases latency for the fetched tasks. - * Tell put_prev_task_scx() to put @prev on local_dsq. If the - * BPF scheduler wants to handle this explicitly, it should - * implement ->cpu_released(). + * Tell pick_next_task_scx() to keep running @prev. If the BPF + * scheduler wants to handle this explicitly, it should + * implement ->cpu_release(). * * See scx_ops_disable_workfn() for the explanation on the * bypassing test. @@ -2570,7 +2557,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local) goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) - goto out; + goto no_tasks; dspc->rq = rq; @@ -2609,13 +2596,23 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local) } } while (dspc->nr_tasks); - goto out; +no_tasks: + /* + * Didn't find another task to run. Keep running @prev unless + * %SCX_OPS_ENQ_LAST is in effect. + */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && + (!static_branch_unlikely(&scx_ops_enq_last) || scx_ops_bypassing())) { + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; + goto has_tasks; + } + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; + return false; has_tasks: - has_tasks = true; -out: rq->scx.flags &= ~SCX_RQ_IN_BALANCE; - return has_tasks; + return true; } static int balance_scx(struct rq *rq, struct task_struct *prev, @@ -2728,25 +2725,16 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); - /* - * If we're being called from put_prev_task_balance(), balance_scx() may - * have decided that @p should keep running. - */ - if (p->scx.flags & SCX_TASK_BAL_KEEP) { + if (p->scx.flags & SCX_TASK_QUEUED) { p->scx.flags &= ~SCX_TASK_BAL_KEEP; - set_task_runnable(rq, p); - dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); - return; - } - if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); /* - * If @p has slice left and balance_scx() didn't tag it for - * keeping, @p is getting preempted by a higher priority - * scheduler class or core-sched forcing a different task. Leave - * it at the head of the local DSQ. 
+ * If @p has slice left and is being put, @p is getting + * preempted by a higher priority scheduler class or core-sched + * forcing a different task. Leave it at the head of the local + * DSQ. */ if (p->scx.slice && !scx_ops_bypassing()) { dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); @@ -2754,18 +2742,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, } /* - * If we're in the pick_next_task path, balance_scx() should - * have already populated the local DSQ if there are any other - * available tasks. If empty, tell ops.enqueue() that @p is the - * only one available for this cpu. ops.enqueue() should put it - * on the local DSQ so that the subsequent pick_next_task_scx() - * can find the task unless it wants to trigger a separate - * follow-up scheduling event. + * If @p is runnable but we're about to enter a lower + * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell + * ops.enqueue() that @p is the only one available for this cpu, + * which should trigger an explicit follow-up scheduling event. */ - if (list_empty(&rq->scx.local_dsq.list)) + if (sched_class_above(&ext_sched_class, next->sched_class)) { + WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last)); do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); - else + } else { do_enqueue_task(rq, p, 0, -1); + } } } @@ -2780,27 +2767,33 @@ static struct task_struct *pick_next_task_scx(struct rq *rq, { struct task_struct *p; - if (prev->sched_class == &ext_sched_class) - put_prev_task_scx(rq, prev, NULL); - - p = first_local_task(rq); - if (!p) - return NULL; - - if (prev->sched_class != &ext_sched_class) - prev->sched_class->put_prev_task(rq, prev, p); - - set_next_task_scx(rq, p, true); + /* + * If balance_scx() is telling us to keep running @prev, replenish slice + * if necessary and keep running @prev. Otherwise, pop the first one + * from the local DSQ. + */ + if (prev->scx.flags & SCX_TASK_BAL_KEEP) { + prev->scx.flags &= ~SCX_TASK_BAL_KEEP; + p = prev; + if (!p->scx.slice) + p->scx.slice = SCX_SLICE_DFL; + } else { + p = first_local_task(rq); + if (!p) + return NULL; - if (unlikely(!p->scx.slice)) { - if (!scx_ops_bypassing() && !scx_warned_zero_slice) { - printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", - p->comm, p->pid); - scx_warned_zero_slice = true; + if (unlikely(!p->scx.slice)) { + if (!scx_ops_bypassing() && !scx_warned_zero_slice) { + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", + p->comm, p->pid); + scx_warned_zero_slice = true; + } + p->scx.slice = SCX_SLICE_DFL; } - p->scx.slice = SCX_SLICE_DFL; } + put_prev_set_next_task(rq, prev, p); + return p; } @@ -3875,12 +3868,13 @@ bool task_should_scx(struct task_struct *p) * to force global FIFO scheduling. * * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. * * b. ops.dispatch() is ignored. * - * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be - * trusted. Whenever a tick triggers, the running task is rotated to the tail - * of the queue with core_sched_at touched. + * c. balance_scx() does not set %SCX_TASK_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. * * d. pick_next_task() suppresses zero slice warning. 
* diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 892278f12dce..7e7f28f4117e 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -205,8 +205,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) /* * All enqueued tasks must have their core_sched_seq updated for correct - * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in - * qmap_ops.flags. + * core-sched ordering. Also, take a look at the end of qmap_dispatch(). */ tctx->core_sched_seq = core_sched_tail_seqs[idx]++; @@ -214,7 +213,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) * If qmap_select_cpu() is telling us to or this is the last runnable * task on the CPU, enqueue locally. */ - if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { + if (tctx->force_local) { tctx->force_local = false; scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; @@ -285,6 +284,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) { struct task_struct *p; struct cpu_ctx *cpuc; + struct task_ctx *tctx; u32 zero = 0, batch = dsp_batch ?: 1; void *fifo; s32 i, pid; @@ -349,6 +349,21 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) cpuc->dsp_cnt = 0; } + + /* + * No other tasks. @prev will keep running. Update its core_sched_seq as + * if the task were enqueued and dispatched immediately. + */ + if (prev) { + tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + tctx->core_sched_seq = + core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++; + } } void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) @@ -701,6 +716,5 @@ SCX_OPS_DEFINE(qmap_ops, .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, - .flags = SCX_OPS_ENQ_LAST, .timeout_ms = 5000U, .name = "qmap"); -- cgit v1.2.3 From 487355f111f98a74d86007c4df0ba9f0f9edc172 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:13 +0100 Subject: KVM: selftests: get-reg-list: add Permission Overlay registers Add new system registers: - POR_EL1 - POR_EL0 Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-31-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 709d7d721760..ac661ebf6859 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -40,6 +40,18 @@ static struct feature_id_reg feat_id_regs[] = { ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ 4, 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 + }, + { + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 } }; @@ -468,6 +480,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ ARM64_SYS_REG(3, 0, 
12, 1, 1), /* DISR_EL1 */ @@ -475,6 +488,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ ARM64_SYS_REG(3, 3, 14, 0, 1), /* CNTPCT_EL0 */ -- cgit v1.2.3 From 6354a0184c542f2b8fade9cb0eb843acd3310191 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:07 +0100 Subject: kselftest/arm64: move get_header() Put this function in the header so that it can be used by other tests, without needing to link to testcases.c. This will be used by selftest/mm/protection_keys.c Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-25-joey.gouly@arm.com Signed-off-by: Will Deacon --- .../selftests/arm64/signal/testcases/testcases.c | 23 -------------------- .../selftests/arm64/signal/testcases/testcases.h | 25 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index 674b88cc8c39..e4331440fed0 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -6,29 +6,6 @@ #include "testcases.h" -struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset) -{ - size_t offs = 0; - struct _aarch64_ctx *found = NULL; - - if (!head || resv_sz < HDR_SZ) - return found; - - while (offs <= resv_sz - HDR_SZ && - head->magic != magic && head->magic) { - offs += head->size; - head = GET_RESV_NEXT_HEAD(head); - } - if (head->magic == magic) { - found = head; - if (offset) - *offset = offs; - } - - return found; -} - bool validate_extra_context(struct extra_context *extra, char **err, void **extra_data, size_t *extra_size) { diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 7727126347e0..3185e6875694 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -88,8 +88,29 @@ struct fake_sigframe { bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err); -struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset); +static inline struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, + size_t resv_sz, size_t *offset) +{ + size_t offs = 0; + struct _aarch64_ctx *found = NULL; + + if (!head || resv_sz < HDR_SZ) + return found; + + while (offs <= resv_sz - HDR_SZ && + head->magic != magic && head->magic) { + offs += head->size; + head = GET_RESV_NEXT_HEAD(head); + } + if (head->magic == magic) { + found = head; + if (offset) + *offset = offs; + } + + return found; +} + static inline struct _aarch64_ctx *get_terminator(struct _aarch64_ctx *head, size_t resv_sz, -- cgit v1.2.3 From 41bbcf7b4b046b4e7190c1866625aed0fe6f69f6 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:08 +0100 Subject: selftests: mm: move fpregs printing arm64's fpregs are not at a constant offset from sigcontext. 
Since this is not an important part of the test, don't print the fpregs pointer on arm64. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240822151113.1479789-26-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/mm/pkey-powerpc.h | 1 + tools/testing/selftests/mm/pkey-x86.h | 2 ++ tools/testing/selftests/mm/protection_keys.c | 6 ++++++ 3 files changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index ae5df26104e5..6275d0f474b3 100644 --- a/tools/testing/selftests/mm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -9,6 +9,7 @@ #endif #define REG_IP_IDX PT_NIP #define REG_TRAPNO PT_TRAP +#define MCONTEXT_FPREGS #define gregs gp_regs #define fpregs fp_regs #define si_pkey_offset 0x20 diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index 814758e109c0..b9170a26bfcb 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,8 @@ #endif +#define MCONTEXT_FPREGS + #ifndef PKEY_DISABLE_ACCESS # define PKEY_DISABLE_ACCESS 0x1 #endif diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index eaa6d1fc5328..4337106a985e 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -314,7 +314,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) ucontext_t *uctxt = vucontext; int trapno; unsigned long ip; +#ifdef MCONTEXT_FPREGS char *fpregs; +#endif #if defined(__i386__) || defined(__x86_64__) /* arch */ u32 *pkey_reg_ptr; int pkey_reg_offset; @@ -330,7 +332,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; +#ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; +#endif dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", __func__, trapno, ip, si_code_str(si->si_code), @@ -359,7 +363,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #endif /* arch */ dprintf1("siginfo: %p\n", si); +#ifdef MCONTEXT_FPREGS dprintf1(" fpregs: %p\n", fpregs); +#endif if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || -- cgit v1.2.3 From f5b5ea51f78f2ebd94d5a77702bbe5eee8924b50 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:09 +0100 Subject: selftests: mm: make protection_keys test work on arm64 The encoding of the pkey register on arm64 differs from that on x86/ppc. On those platforms, a set bit in the register disables a permission; on arm64, a set bit indicates that the permission is allowed. This drops two asserts of the form: assert(read_pkey_reg() <= orig_pkey_reg); because on arm64, due to the encoding, this doesn't hold. The pkey must be reset to both access allow and write allow in the signal handler. pkey_access_allow() currently works for PowerPC as the PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE have overlapping bits set. Access to the uc_mcontext is abstracted, as arm64 has a different structure. 
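To make the encoding difference concrete, here is a minimal sketch of how a POR_EL0 value is composed. It uses the POE_* constants and the 4-bits-per-key layout from pkey-arm64.h (added by this patch, shown below) and only demonstrates the bit arithmetic on the host; it does not touch the register:

#include <stdio.h>
#include <stdint.h>

/* Values mirror the definitions in pkey-arm64.h below. */
#define POE_NONE		0x0
#define POE_RX			0x3
#define POE_RWX			0x7
#define PKEY_BITS_PER_PKEY	4

int main(void)
{
	uint64_t por = 0;

	/* pkey 0: read/write/execute, pkey 1: read/execute, pkey 2: no access. */
	por |= (uint64_t)POE_RWX  << (0 * PKEY_BITS_PER_PKEY);
	por |= (uint64_t)POE_RX   << (1 * PKEY_BITS_PER_PKEY);
	por |= (uint64_t)POE_NONE << (2 * PKEY_BITS_PER_PKEY);

	/*
	 * A set bit grants a permission here, the opposite of the x86/ppc
	 * disable-bit encoding, which is why the <= asserts had to go.
	 */
	printf("POR_EL0 = 0x%016llx\n", (unsigned long long)por); /* 0x37 */
	return 0;
}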
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240822151113.1479789-27-joey.gouly@arm.com Signed-off-by: Will Deacon --- .../selftests/arm64/signal/testcases/testcases.h | 3 + tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/pkey-arm64.h | 139 +++++++++++++++++++++ tools/testing/selftests/mm/pkey-helpers.h | 8 ++ tools/testing/selftests/mm/pkey-powerpc.h | 2 + tools/testing/selftests/mm/pkey-x86.h | 2 + tools/testing/selftests/mm/protection_keys.c | 103 +++++++++++++-- 7 files changed, 247 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/mm/pkey-arm64.h (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 3185e6875694..9872b8912714 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -26,6 +26,9 @@ #define HDR_SZ \ sizeof(struct _aarch64_ctx) +#define GET_UC_RESV_HEAD(uc) \ + (struct _aarch64_ctx *)(&(uc->uc_mcontext.__reserved)) + #define GET_SF_RESV_HEAD(sf) \ (struct _aarch64_ctx *)(&(sf).uc.uc_mcontext.__reserved) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 7b8a5def54a1..b474ac7be8a5 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -104,7 +104,7 @@ TEST_GEN_FILES += $(BINARIES_64) endif else -ifneq (,$(findstring $(ARCH),powerpc)) +ifneq (,$(filter $(ARCH),arm64 powerpc)) TEST_GEN_FILES += protection_keys endif diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h new file mode 100644 index 000000000000..580e1b0bb38e --- /dev/null +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. 
+ */ + +#ifndef _PKEYS_ARM64_H +#define _PKEYS_ARM64_H + +#include "vm_util.h" +/* for signal frame parsing */ +#include "../arm64/signal/testcases/testcases.h" + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 288 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 289 +# define SYS_pkey_free 290 +#endif +#define MCONTEXT_IP(mc) mc.pc +#define MCONTEXT_TRAPNO(mc) -1 + +#define PKEY_MASK 0xf + +#define POE_NONE 0x0 +#define POE_X 0x2 +#define POE_RX 0x3 +#define POE_RWX 0x7 + +#define NR_PKEYS 8 +#define NR_RESERVED_PKEYS 1 /* pkey-0 */ + +#define PKEY_ALLOW_ALL 0x77777777 + +#define PKEY_BITS_PER_PKEY 4 +#define PAGE_SIZE sysconf(_SC_PAGESIZE) +#undef HPAGE_SIZE +#define HPAGE_SIZE default_huge_page_size() + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg = 0; + + // POR_EL0 + asm volatile("mrs %0, S3_3_c10_c2_4" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 por = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + // POR_EL0 + asm volatile("msr S3_3_c10_c2_4, %0\nisb" :: "r" (por) :); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#define set_pkey_bits set_pkey_bits +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + u64 new_val = POE_RWX; + + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + + if (flags & PKEY_DISABLE_ACCESS) + new_val = POE_X; + else if (flags & PKEY_DISABLE_WRITE) + new_val = POE_RX; + + /* OR in new bits for pkey */ + reg |= new_val << shift; + + return reg; +} + +#define get_pkey_bits get_pkey_bits +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest four, then + * mask off all the other higher bits + */ + u32 perm = (reg >> shift) & PKEY_MASK; + + if (perm == POE_X) + return PKEY_DISABLE_ACCESS; + if (perm == POE_RX) + return PKEY_DISABLE_WRITE; + return 0; +} + +static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) +{ + struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt); + struct poe_context *poe_ctx = + (struct poe_context *) get_header(ctx, POE_MAGIC, + sizeof(uctxt->uc_mcontext), NULL); + if (poe_ctx) + poe_ctx->por_el0 = pkey; +} + +#endif /* _PKEYS_ARM64_H */ diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 1af3156a9db8..15608350fc01 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -91,12 +91,17 @@ void record_pkey_malloc(void *ptr, long size, int prot); #include "pkey-x86.h" #elif defined(__powerpc64__) /* arch */ #include "pkey-powerpc.h" +#elif defined(__aarch64__) /* arch */ +#include "pkey-arm64.h" #else /* arch */ #error Architecture not supported #endif /* arch */ +#ifndef PKEY_MASK #define PKEY_MASK 
(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif +#ifndef set_pkey_bits static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) { u32 shift = pkey_bit_position(pkey); @@ -106,7 +111,9 @@ static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) reg |= (flags & PKEY_MASK) << shift; return reg; } +#endif +#ifndef get_pkey_bits static inline u64 get_pkey_bits(u64 reg, int pkey) { u32 shift = pkey_bit_position(pkey); @@ -116,6 +123,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey) */ return ((reg >> shift) & PKEY_MASK); } +#endif extern u64 shadow_pkey_reg; diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index 6275d0f474b3..3d0c0bdae5bc 100644 --- a/tools/testing/selftests/mm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -8,6 +8,8 @@ # define SYS_pkey_free 385 #endif #define REG_IP_IDX PT_NIP +#define MCONTEXT_IP(mc) mc.gp_regs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gp_regs[REG_TRAPNO] #define REG_TRAPNO PT_TRAP #define MCONTEXT_FPREGS #define gregs gp_regs diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index b9170a26bfcb..5f28e26a2511 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,8 @@ #endif +#define MCONTEXT_IP(mc) mc.gregs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gregs[REG_TRAPNO] #define MCONTEXT_FPREGS #ifndef PKEY_DISABLE_ACCESS diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 4337106a985e..0789981b72b9 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -147,7 +147,7 @@ void abort_hooks(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. 
*/ -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__aarch64__) /* This way, both 4K and 64K alignment are maintained */ __attribute__((__aligned__(65536))) #else @@ -212,7 +212,6 @@ void pkey_disable_set(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg(); dprintf1("START->%s(%d, 0x%x)\n", __func__, pkey, flags); @@ -242,8 +241,6 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@ -253,7 +250,6 @@ void pkey_disable_clear(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg(); pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); @@ -273,8 +269,6 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - assert(read_pkey_reg() <= orig_pkey_reg); } void pkey_write_allow(int pkey) @@ -330,8 +324,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); - trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext); + ip = MCONTEXT_IP(uctxt->uc_mcontext); #ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; #endif @@ -395,6 +389,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #elif defined(__powerpc64__) /* arch */ /* restore access and let the faulting instruction continue */ pkey_access_allow(siginfo_pkey); +#elif defined(__aarch64__) + aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); #endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); @@ -908,7 +904,9 @@ void expected_pkey_fault(int pkey) * test program continue. We now have to restore it. */ if (__read_pkey_reg() != 0) -#else /* arch */ +#elif defined(__aarch64__) + if (__read_pkey_reg() != PKEY_ALLOW_ALL) +#else if (__read_pkey_reg() != shadow_pkey_reg) #endif /* arch */ pkey_assert(0); @@ -1498,6 +1496,11 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) lots_o_noops_around_write(&scratch); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); expect_fault_on_read_execonly_key(p1, pkey); + + // Reset back to PROT_EXEC | PROT_READ for architectures that support + // non-PKEY execute-only permissions. + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey); + pkey_assert(!ret); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1671,6 +1674,84 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) } #endif +#if defined(__aarch64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + pid_t child; + int status, ret; + struct iovec iov; + u64 trace_pkey; + /* Just a random pkey value.. 
*/ + u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) | + (POE_NONE << PKEY_BITS_PER_PKEY) | + POE_RWX; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkey) + exit(1); + + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + iov.iov_base = &trace_pkey; + iov.iov_len = 8; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == read_pkey_reg()); + + trace_pkey = new_pkey; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); +} +#endif + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; @@ -1706,7 +1787,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) test_ptrace_modifies_pkru, #endif }; -- cgit v1.2.3 From fabf056278b4ccddea4a944a1635fee44033b71f Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:10 +0100 Subject: kselftest/arm64: add HWCAP test for FEAT_S1POE Check that when POE is enabled, the POR_EL0 register is accessible. 
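The probe that this hwcap entry automates can also be done by hand. A rough arm64-only sketch, assuming kernel headers new enough to define HWCAP2_POE, might look like:

#include <stdio.h>
#include <stdint.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>	/* HWCAP2_POE; assumes POE-aware kernel headers */

int main(void)
{
	uint64_t por;

	if (!(getauxval(AT_HWCAP2) & HWCAP2_POE)) {
		printf("POE not reported; reading POR_EL0 would SIGILL\n");
		return 1;
	}

	/* S3_3_C10_C2_4 is the sysreg encoding of POR_EL0, as in poe_sigill(). */
	asm volatile("mrs %0, S3_3_C10_C2_4" : "=r"(por));
	printf("POR_EL0 = 0x%016llx\n", (unsigned long long)por);
	return 0;
}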
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-28-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index d8909b2b535a..f2d6007a2b98 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -156,6 +156,12 @@ static void pmull_sigill(void) asm volatile(".inst 0x0ee0e000" : : : ); } +static void poe_sigill(void) +{ + /* mrs x0, POR_EL0 */ + asm volatile("mrs x0, S3_3_C10_C2_4" : : : "x0"); +} + static void rng_sigill(void) { asm volatile("mrs x0, S3_3_C2_C4_0" : : : "x0"); @@ -601,6 +607,14 @@ static const struct hwcap_data { .cpuinfo = "pmull", .sigill_fn = pmull_sigill, }, + { + .name = "POE", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_POE, + .cpuinfo = "poe", + .sigill_fn = poe_sigill, + .sigill_reliable = true, + }, { .name = "RNG", .at_hwcap = AT_HWCAP2, -- cgit v1.2.3 From d3c6e5b1093ac68ef66e3c78564e8fb958180bac Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:11 +0100 Subject: kselftest/arm64: parse POE_MAGIC in a signal frame Teach the signal frame parsing about the new POE frame; this avoids a warning when it is generated. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-29-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/signal/testcases/testcases.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index e4331440fed0..e6daa94fcd2e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -161,6 +161,10 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) if (head->size != sizeof(struct esr_context)) *err = "Bad size for esr_context"; break; + case POE_MAGIC: + if (head->size != sizeof(struct poe_context)) + *err = "Bad size for poe_context"; + break; case TPIDR2_MAGIC: if (head->size != sizeof(struct tpidr2_context)) *err = "Bad size for tpidr2_context"; break; -- cgit v1.2.3 From 6a428d63717add52bba4175a7fde54d1f9d166e0 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:12 +0100 Subject: kselftest/arm64: Add test case for POR_EL0 signal frame records Ensure that we get signal context for POR_EL0 if and only if POE is present on the system. Copied from the TPIDR2 test. 
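For reference, digging the new record out of a captured context follows the same pattern as the other optional frames. A sketch reusing the helpers already shown earlier in this series (GET_UC_RESV_HEAD() and the inlined get_header() from testcases.h), with the resv_sz argument mirroring how pkey-arm64.h calls it:

#include <stdio.h>
#include <ucontext.h>
#include "testcases.h"	/* _aarch64_ctx, poe_context, POE_MAGIC, get_header() */

/* Look up the POE record in a ucontext captured by a signal handler. */
static void show_poe_record(ucontext_t *uc)
{
	struct _aarch64_ctx *head = GET_UC_RESV_HEAD(uc);
	struct poe_context *poe;

	poe = (struct poe_context *)get_header(head, POE_MAGIC,
					       sizeof(uc->uc_mcontext), NULL);
	if (poe)
		printf("POR_EL0 at signal delivery: 0x%llx\n", poe->por_el0);
	else
		printf("no POE record - POE likely unsupported\n");
}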
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20240822151113.1479789-30-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/signal/.gitignore | 1 + .../selftests/arm64/signal/testcases/poe_siginfo.c | 86 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index 1ce5b5eac386..b2f2bfd5c6aa 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -2,6 +2,7 @@ mangle_* fake_sigreturn_* fpmr_* +poe_* sme_* ssve_* sve_* diff --git a/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c new file mode 100644 index 000000000000..36bd9940ee05 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Arm Limited + * + * Verify that the POR_EL0 register context in signal frames is set up as + * expected. + */ + +#include +#include +#include +#include +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +static union { + ucontext_t uc; + char buf[1024 * 128]; +} context; + +#define SYS_POR_EL0 "S3_3_C10_C2_4" + +static uint64_t get_por_el0(void) +{ + uint64_t val; + + asm volatile( + "mrs %0, " SYS_POR_EL0 "\n" + : "=r"(val) + : + : ); + + return val; +} + +int poe_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context); + struct poe_context *poe_ctx; + size_t offset; + bool in_sigframe; + bool have_poe; + __u64 orig_poe; + + have_poe = getauxval(AT_HWCAP2) & HWCAP2_POE; + if (have_poe) + orig_poe = get_por_el0(); + + if (!get_current_context(td, &context.uc, sizeof(context))) + return 1; + + poe_ctx = (struct poe_context *) + get_header(head, POE_MAGIC, td->live_sz, &offset); + + in_sigframe = poe_ctx != NULL; + + fprintf(stderr, "POR_EL0 sigframe %s on system %s POE\n", + in_sigframe ? "present" : "absent", + have_poe ? "with" : "without"); + + td->pass = (in_sigframe == have_poe); + + /* + * Check that the value we read back was the one present at + * the time that the signal was triggered. + */ + if (have_poe && poe_ctx) { + if (poe_ctx->por_el0 != orig_poe) { + fprintf(stderr, "POR_EL0 in frame is %llx, was %llx\n", + poe_ctx->por_el0, orig_poe); + td->pass = false; + } + } + + return 0; +} + +struct tdescr tde = { + .name = "POR_EL0", + .descr = "Validate that POR_EL0 is present as expected", + .timeout = 3, + .run = poe_present, +}; -- cgit v1.2.3 From 5c26516f090363b548c51ce892b8bcc46c7dcdbc Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 3 Sep 2024 00:06:10 +0800 Subject: selftests: add selftest for UDP SO_PEEK_OFF support Add the SO_PEEK_OFF selftest for UDP. In this patch, I mainly do three things: 1. rename tcp_so_peek_off.c 2. adjust for UDP protocol 3. add selftests into it Suggested-by: Jon Maloy Reviewed-by: Willem de Bruijn Signed-off-by: Jason Xing Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/sk_so_peek_off.c | 202 ++++++++++++++++++++++++++ tools/testing/selftests/net/tcp_so_peek_off.c | 183 ----------------------- 4 files changed, 204 insertions(+), 184 deletions(-) create mode 100644 tools/testing/selftests/net/sk_so_peek_off.c delete mode 100644 tools/testing/selftests/net/tcp_so_peek_off.c (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 666ab7d9390b..923bf098e2eb 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -34,6 +34,7 @@ scm_pidfd scm_rights sk_bind_sendto_listen sk_connect_zero_addr +sk_so_peek_off socket so_incoming_cpu so_netns_cookie diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 1179e3261bef..d5029f978aa9 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -80,7 +80,7 @@ TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr -TEST_GEN_PROGS += tcp_so_peek_off +TEST_GEN_PROGS += sk_so_peek_off TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh diff --git a/tools/testing/selftests/net/sk_so_peek_off.c b/tools/testing/selftests/net/sk_so_peek_off.c new file mode 100644 index 000000000000..d87dd8d8d491 --- /dev/null +++ b/tools/testing/selftests/net/sk_so_peek_off.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" + +static char *afstr(int af, int proto) +{ + if (proto == IPPROTO_TCP) + return af == AF_INET ? "TCP/IPv4" : "TCP/IPv6"; + else + return af == AF_INET ? "UDP/IPv4" : "UDP/IPv6"; +} + +int sk_peek_offset_probe(sa_family_t af, int proto) +{ + int type = (proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM); + int optv = 0; + int ret = 0; + int s; + + s = socket(af, type, proto); + if (s < 0) { + ksft_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + ret = 1; + else + printf("%s does not support SO_PEEK_OFF\n", afstr(af, proto)); + close(s); + } + return ret; +} + +static void sk_peek_offset_set(int s, int offset) +{ + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + ksft_perror("Failed to set SO_PEEK_OFF value\n"); +} + +static int sk_peek_offset_get(int s) +{ + int offset; + socklen_t len = sizeof(offset); + + if (getsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, &len)) + ksft_perror("Failed to get SO_PEEK_OFF value\n"); + return offset; +} + +static int sk_peek_offset_test(sa_family_t af, int proto) +{ + int type = (proto == IPPROTO_TCP ? 
SOCK_STREAM : SOCK_DGRAM); + union { + struct sockaddr sa; + struct sockaddr_in a4; + struct sockaddr_in6 a6; + } a; + int res = 0; + int s[2] = {0, 0}; + int recv_sock = 0; + int offset = 0; + ssize_t len; + char buf[2]; + + memset(&a, 0, sizeof(a)); + a.sa.sa_family = af; + + s[0] = recv_sock = socket(af, type, proto); + s[1] = socket(af, type, proto); + + if (s[0] < 0 || s[1] < 0) { + ksft_perror("Temporary socket creation failed\n"); + goto out; + } + if (bind(s[0], &a.sa, sizeof(a)) < 0) { + ksft_perror("Temporary socket bind() failed\n"); + goto out; + } + if (getsockname(s[0], &a.sa, &((socklen_t) { sizeof(a) })) < 0) { + ksft_perror("Temporary socket getsockname() failed\n"); + goto out; + } + if (proto == IPPROTO_TCP && listen(s[0], 0) < 0) { + ksft_perror("Temporary socket listen() failed\n"); + goto out; + } + if (connect(s[1], &a.sa, sizeof(a)) < 0) { + ksft_perror("Temporary socket connect() failed\n"); + goto out; + } + if (proto == IPPROTO_TCP) { + recv_sock = accept(s[0], NULL, NULL); + if (recv_sock <= 0) { + ksft_perror("Temporary socket accept() failed\n"); + goto out; + } + } + + /* Some basic tests of getting/setting offset */ + offset = sk_peek_offset_get(recv_sock); + if (offset != -1) { + ksft_perror("Initial value of socket offset not -1\n"); + goto out; + } + sk_peek_offset_set(recv_sock, 0); + offset = sk_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Failed to set socket offset to 0\n"); + goto out; + } + + /* Transfer a message */ + if (send(s[1], (char *)("ab"), 2, 0) != 2) { + ksft_perror("Temporary probe socket send() failed\n"); + goto out; + } + /* Read first byte */ + len = recv(recv_sock, buf, 1, MSG_PEEK); + if (len != 1 || buf[0] != 'a') { + ksft_perror("Failed to read first byte of message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 1) { + ksft_perror("Offset not forwarded correctly at first byte\n"); + goto out; + } + /* Try to read beyond last byte */ + len = recv(recv_sock, buf, 2, MSG_PEEK); + if (len != 1 || buf[0] != 'b') { + ksft_perror("Failed to read last byte of message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 2) { + ksft_perror("Offset not forwarded correctly at last byte\n"); + goto out; + } + /* Flush message */ + len = recv(recv_sock, buf, 2, MSG_TRUNC); + if (len != 2) { + ksft_perror("Failed to flush message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Offset not reverted correctly after flush\n"); + goto out; + } + + printf("%s with MSG_PEEK_OFF works correctly\n", afstr(af, proto)); + res = 1; +out: + if (proto == IPPROTO_TCP && recv_sock >= 0) + close(recv_sock); + if (s[1] >= 0) + close(s[1]); + if (s[0] >= 0) + close(s[0]); + return res; +} + +static int do_test(int proto) +{ + int res4, res6; + + res4 = sk_peek_offset_probe(AF_INET, proto); + res6 = sk_peek_offset_probe(AF_INET6, proto); + + if (!res4 && !res6) + return KSFT_SKIP; + + if (res4) + res4 = sk_peek_offset_test(AF_INET, proto); + + if (res6) + res6 = sk_peek_offset_test(AF_INET6, proto); + + if (!res4 || !res6) + return KSFT_FAIL; + + return KSFT_PASS; +} + +int main(void) +{ + int restcp, resudp; + + restcp = do_test(IPPROTO_TCP); + resudp = do_test(IPPROTO_UDP); + if (restcp == KSFT_FAIL || resudp == KSFT_FAIL) + return KSFT_FAIL; + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/net/tcp_so_peek_off.c b/tools/testing/selftests/net/tcp_so_peek_off.c deleted file mode 100644 index df8a39d9d3c3..000000000000 --- 
a/tools/testing/selftests/net/tcp_so_peek_off.c +++ /dev/null @@ -1,183 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../kselftest.h" - -static char *afstr(int af) -{ - return af == AF_INET ? "TCP/IPv4" : "TCP/IPv6"; -} - -int tcp_peek_offset_probe(sa_family_t af) -{ - int optv = 0; - int ret = 0; - int s; - - s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); - if (s < 0) { - ksft_perror("Temporary TCP socket creation failed"); - } else { - if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) - ret = 1; - else - printf("%s does not support SO_PEEK_OFF\n", afstr(af)); - close(s); - } - return ret; -} - -static void tcp_peek_offset_set(int s, int offset) -{ - if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) - ksft_perror("Failed to set SO_PEEK_OFF value\n"); -} - -static int tcp_peek_offset_get(int s) -{ - int offset; - socklen_t len = sizeof(offset); - - if (getsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, &len)) - ksft_perror("Failed to get SO_PEEK_OFF value\n"); - return offset; -} - -static int tcp_peek_offset_test(sa_family_t af) -{ - union { - struct sockaddr sa; - struct sockaddr_in a4; - struct sockaddr_in6 a6; - } a; - int res = 0; - int s[2] = {0, 0}; - int recv_sock = 0; - int offset = 0; - ssize_t len; - char buf; - - memset(&a, 0, sizeof(a)); - a.sa.sa_family = af; - - s[0] = socket(af, SOCK_STREAM, IPPROTO_TCP); - s[1] = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); - - if (s[0] < 0 || s[1] < 0) { - ksft_perror("Temporary socket creation failed\n"); - goto out; - } - if (bind(s[0], &a.sa, sizeof(a)) < 0) { - ksft_perror("Temporary socket bind() failed\n"); - goto out; - } - if (getsockname(s[0], &a.sa, &((socklen_t) { sizeof(a) })) < 0) { - ksft_perror("Temporary socket getsockname() failed\n"); - goto out; - } - if (listen(s[0], 0) < 0) { - ksft_perror("Temporary socket listen() failed\n"); - goto out; - } - if (connect(s[1], &a.sa, sizeof(a)) >= 0 || errno != EINPROGRESS) { - ksft_perror("Temporary socket connect() failed\n"); - goto out; - } - recv_sock = accept(s[0], NULL, NULL); - if (recv_sock <= 0) { - ksft_perror("Temporary socket accept() failed\n"); - goto out; - } - - /* Some basic tests of getting/setting offset */ - offset = tcp_peek_offset_get(recv_sock); - if (offset != -1) { - ksft_perror("Initial value of socket offset not -1\n"); - goto out; - } - tcp_peek_offset_set(recv_sock, 0); - offset = tcp_peek_offset_get(recv_sock); - if (offset != 0) { - ksft_perror("Failed to set socket offset to 0\n"); - goto out; - } - - /* Transfer a message */ - if (send(s[1], (char *)("ab"), 2, 0) <= 0 || errno != EINPROGRESS) { - ksft_perror("Temporary probe socket send() failed\n"); - goto out; - } - /* Read first byte */ - len = recv(recv_sock, &buf, 1, MSG_PEEK); - if (len != 1 || buf != 'a') { - ksft_perror("Failed to read first byte of message\n"); - goto out; - } - offset = tcp_peek_offset_get(recv_sock); - if (offset != 1) { - ksft_perror("Offset not forwarded correctly at first byte\n"); - goto out; - } - /* Try to read beyond last byte */ - len = recv(recv_sock, &buf, 2, MSG_PEEK); - if (len != 1 || buf != 'b') { - ksft_perror("Failed to read last byte of message\n"); - goto out; - } - offset = tcp_peek_offset_get(recv_sock); - if (offset != 2) { - ksft_perror("Offset not forwarded correctly at last byte\n"); - goto out; - } - /* Flush message */ - len = recv(recv_sock, NULL, 2, MSG_TRUNC); - if (len != 2) { - ksft_perror("Failed to 
flush message\n"); - goto out; - } - offset = tcp_peek_offset_get(recv_sock); - if (offset != 0) { - ksft_perror("Offset not reverted correctly after flush\n"); - goto out; - } - - printf("%s with MSG_PEEK_OFF works correctly\n", afstr(af)); - res = 1; -out: - if (recv_sock >= 0) - close(recv_sock); - if (s[1] >= 0) - close(s[1]); - if (s[0] >= 0) - close(s[0]); - return res; -} - -int main(void) -{ - int res4, res6; - - res4 = tcp_peek_offset_probe(AF_INET); - res6 = tcp_peek_offset_probe(AF_INET6); - - if (!res4 && !res6) - return KSFT_SKIP; - - if (res4) - res4 = tcp_peek_offset_test(AF_INET); - - if (res6) - res6 = tcp_peek_offset_test(AF_INET6); - - if (!res4 || !res6) - return KSFT_FAIL; - - return KSFT_PASS; -} -- cgit v1.2.3 From dd7c445beb7baf0ed071aba94341857b07fde69b Mon Sep 17 00:00:00 2001 From: "Yo-Jung (Leo) Lin" <0xff07@gmail.com> Date: Sun, 25 Aug 2024 17:53:50 +0800 Subject: pm-graph: Make git ignore sleepgraph.py artifacts By default, sleepgraph.py creates suspend-{date}-{time} directories to store artifacts, or suspend-{date}-{time}-xN if the --multi option is used. Ignore those directories by adding a .gitignore file. Signed-off-by: Yo-Jung (Leo) Lin <0xff07@gmail.com> Acked-by: Todd Brandt Link: https://patch.msgid.link/20240825095353.7578-1-0xff07@gmail.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- tools/power/pm-graph/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/power/pm-graph/.gitignore (limited to 'tools') diff --git a/tools/power/pm-graph/.gitignore b/tools/power/pm-graph/.gitignore new file mode 100644 index 000000000000..37762a8a06d6 --- /dev/null +++ b/tools/power/pm-graph/.gitignore @@ -0,0 +1,3 @@ +# sleepgraph.py artifacts +suspend-[0-9]*-[0-9]* +suspend-[0-9]*-[0-9]*-x[0-9]* -- cgit v1.2.3 From 387ce37ed57c5d92400c5cff9d5c887f95db0236 Mon Sep 17 00:00:00 2001 From: Amit Vadhavana Date: Sun, 25 Aug 2024 16:36:20 +0530 Subject: pm-graph: Update directory handling and installation process in Makefile - Standardize directory variables to support more flexible installations. - Add copyright and licensing information to the Makefile. - Introduce ".PHONY" declarations to ensure that specific targets are always executed, regardless of the presence of files with matching names. - Add a help target to provide usage instructions. Signed-off-by: Amit Vadhavana Acked-by: Todd Brandt Link: https://patch.msgid.link/Update directory handling and installation process in Makefile [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- tools/power/pm-graph/Makefile | 111 +++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/power/pm-graph/Makefile b/tools/power/pm-graph/Makefile index b5310832c19c..aeddbaf2d4c4 100644 --- a/tools/power/pm-graph/Makefile +++ b/tools/power/pm-graph/Makefile @@ -1,51 +1,86 @@ # SPDX-License-Identifier: GPL-2.0 -PREFIX ?= /usr -DESTDIR ?= +# +# Copyright (c) 2013, Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms and conditions of the GNU General Public License, +# version 2, as published by the Free Software Foundation. +# +# This program is distributed in the hope it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. 
+# +# Authors: +# Todd Brandt + +# Prefix to the directories we're installing to +DESTDIR ?= + +# Directory definitions. These are default and most probably +# do not need to be changed. Please note that DESTDIR is +# added in front of any of them + +BINDIR ?= /usr/bin +MANDIR ?= /usr/share/man +LIBDIR ?= /usr/lib + +# Toolchain: what tools do we use, and what options do they need: +INSTALL = /usr/bin/install +INSTALL_DATA = ${INSTALL} -m 644 all: @echo "Nothing to build" install : uninstall - install -d $(DESTDIR)$(PREFIX)/lib/pm-graph - install sleepgraph.py $(DESTDIR)$(PREFIX)/lib/pm-graph - install bootgraph.py $(DESTDIR)$(PREFIX)/lib/pm-graph - install -d $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/cgskip.txt $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-x2-proc.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - - install -d $(DESTDIR)$(PREFIX)/bin - ln -s ../lib/pm-graph/bootgraph.py $(DESTDIR)$(PREFIX)/bin/bootgraph - ln -s ../lib/pm-graph/sleepgraph.py $(DESTDIR)$(PREFIX)/bin/sleepgraph - - install -d $(DESTDIR)$(PREFIX)/share/man/man8 - install bootgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8 - install sleepgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8 + $(INSTALL) -d $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) sleepgraph.py $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) bootgraph.py $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) -d $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/cgskip.txt $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-x2-proc.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + + $(INSTALL) -d $(DESTDIR)$(BINDIR) + ln -s ../lib/pm-graph/bootgraph.py $(DESTDIR)$(BINDIR)/bootgraph + ln -s ../lib/pm-graph/sleepgraph.py $(DESTDIR)$(BINDIR)/sleepgraph + + $(INSTALL) -d $(DESTDIR)$(MANDIR)/man8 + $(INSTALL) bootgraph.8 $(DESTDIR)$(MANDIR)/man8 + $(INSTALL) sleepgraph.8 $(DESTDIR)$(MANDIR)/man8 uninstall : - rm -f $(DESTDIR)$(PREFIX)/share/man/man8/bootgraph.8 - rm -f $(DESTDIR)$(PREFIX)/share/man/man8/sleepgraph.8 + rm -f $(DESTDIR)$(MANDIR)/man8/bootgraph.8 + rm -f $(DESTDIR)$(MANDIR)/man8/sleepgraph.8 - rm -f 
$(DESTDIR)$(PREFIX)/bin/bootgraph - rm -f $(DESTDIR)$(PREFIX)/bin/sleepgraph + rm -f $(DESTDIR)$(BINDIR)/bootgraph + rm -f $(DESTDIR)$(BINDIR)/sleepgraph - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/config/* - if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph/config ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph/config; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/config/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph/config ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph/config; \ fi; - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__/* - if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__ ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__ ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__; \ fi; - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/* - if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph; \ fi; + +help: + @echo 'Building targets:' + @echo ' all - Nothing to build' + @echo ' install - Install the program and create necessary directories' + @echo ' uninstall - Remove installed files and directories' + +.PHONY: all install uninstall help -- cgit v1.2.3 From 70b27c756f95156f51b1e28ec7d1b16a539d5487 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 3 Sep 2024 22:06:04 -0700 Subject: perf parse-events: Add default_breakpoint_len helper The default breakpoint length is "sizeof(long)"; however, this is incorrect on platforms like AArch64, where sizeof(long) is 8 but the breakpoint length is 4. Add a helper function that can be used to determine the correct breakpoint length; in this change it just returns the existing default sizeof(long) value. Use the helper in the bp_account test so that, when modifying the event from a watchpoint to a breakpoint, the breakpoint length is appropriate for the architecture and not just sizeof(long). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240904050606.752788-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/bp_account.c | 4 +++- tools/perf/tests/bp_signal.c | 3 ++- tools/perf/tests/bp_signal_overflow.c | 3 ++- tools/perf/tests/parse-events.c | 3 ++- tools/perf/util/parse-events.c | 7 ++++++- tools/perf/util/parse-events.h | 2 ++ 6 files changed, 17 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/bp_account.c b/tools/perf/tests/bp_account.c index 6f921db33cf9..4cb7d486b5c1 100644 --- a/tools/perf/tests/bp_account.c +++ b/tools/perf/tests/bp_account.c @@ -16,6 +16,7 @@ #include "tests.h" #include "debug.h" #include "event.h" +#include "parse-events.h" #include "../perf-sys.h" #include "cloexec.h" @@ -50,7 +51,7 @@ static int __event(bool is_x, void *addr, struct perf_event_attr *attr) attr->config = 0; attr->bp_type = is_x ? HW_BREAKPOINT_X : HW_BREAKPOINT_W; attr->bp_addr = (unsigned long) addr; - attr->bp_len = sizeof(long); + attr->bp_len = is_x ?
default_breakpoint_len() : sizeof(long); attr->sample_period = 1; attr->sample_type = PERF_SAMPLE_IP; @@ -92,6 +93,7 @@ static int bp_accounting(int wp_cnt, int share) attr_mod = attr; attr_mod.bp_type = HW_BREAKPOINT_X; attr_mod.bp_addr = (unsigned long) test_function; + attr_mod.bp_len = default_breakpoint_len(); ret = ioctl(fd[0], PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr_mod); TEST_ASSERT_VAL("failed to modify wp\n", ret == 0); diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c index 1f2908f02389..3faeb5b6fe0b 100644 --- a/tools/perf/tests/bp_signal.c +++ b/tools/perf/tests/bp_signal.c @@ -26,6 +26,7 @@ #include "tests.h" #include "debug.h" #include "event.h" +#include "parse-events.h" #include "perf-sys.h" #include "cloexec.h" @@ -111,7 +112,7 @@ static int __event(bool is_x, void *addr, int sig) pe.config = 0; pe.bp_type = is_x ? HW_BREAKPOINT_X : HW_BREAKPOINT_W; pe.bp_addr = (unsigned long) addr; - pe.bp_len = sizeof(long); + pe.bp_len = is_x ? default_breakpoint_len() : sizeof(long); pe.sample_period = 1; pe.sample_type = PERF_SAMPLE_IP; diff --git a/tools/perf/tests/bp_signal_overflow.c b/tools/perf/tests/bp_signal_overflow.c index 4e897c2cf26b..ee560e156be6 100644 --- a/tools/perf/tests/bp_signal_overflow.c +++ b/tools/perf/tests/bp_signal_overflow.c @@ -25,6 +25,7 @@ #include "tests.h" #include "debug.h" #include "event.h" +#include "parse-events.h" #include "../perf-sys.h" #include "cloexec.h" @@ -88,7 +89,7 @@ static int test__bp_signal_overflow(struct test_suite *test __maybe_unused, int pe.config = 0; pe.bp_type = HW_BREAKPOINT_X; pe.bp_addr = (unsigned long) test_function; - pe.bp_len = sizeof(long); + pe.bp_len = default_breakpoint_len(); pe.sample_period = THRESHOLD; pe.sample_type = PERF_SAMPLE_IP; diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index edc2adcf1bae..639e65a9bf61 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -5,6 +5,7 @@ #include #include "tests.h" #include "debug.h" +#include "parse-events.h" #include "pmu.h" #include "pmus.h" #include @@ -262,7 +263,7 @@ static int test__checkevent_breakpoint_x(struct evlist *evlist) TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", HW_BREAKPOINT_X == evsel->core.attr.bp_type); - TEST_ASSERT_VAL("wrong bp_len", sizeof(long) == evsel->core.attr.bp_len); + TEST_ASSERT_VAL("wrong bp_len", default_breakpoint_len() == evsel->core.attr.bp_len); return TEST_OK; } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9c0ce01684c3..8c837c8a6c8c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -671,6 +671,11 @@ static int add_tracepoint_multi_sys(struct parse_events_state *parse_state, } #endif /* HAVE_LIBTRACEEVENT */ +size_t default_breakpoint_len(void) +{ + return sizeof(long); +} + static int parse_breakpoint_type(const char *type, struct perf_event_attr *attr) { @@ -729,7 +734,7 @@ int parse_events_add_breakpoint(struct parse_events_state *parse_state, /* Provide some defaults if len is not specified */ if (!len) { if (attr.bp_type == HW_BREAKPOINT_X) - len = sizeof(long); + len = default_breakpoint_len(); else len = HW_BREAKPOINT_LEN_4; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index b735cd9e0acf..68bfea9ffa70 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -286,4 +286,6 @@ static inline bool is_sdt_event(char *str __maybe_unused) } #endif /* HAVE_LIBELF_SUPPORT 
*/ +size_t default_breakpoint_len(void); + #endif /* __PERF_PARSE_EVENTS_H */ -- cgit v1.2.3 From fa6cc3f932584a5fb5eedd2261fdf0058e1d8828 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 3 Sep 2024 22:06:05 -0700 Subject: perf parse-events: Vary default_breakpoint_len on i386 and arm64 On arm64 the breakpoint length should be 4 bytes, but 8 bytes is tolerated as perf passes that as sizeof(long). Just pass the correct value. On i386 the sizeof(long) check in the kernel needs to match the kernel's long size. Use a perf_env (which does uname checks) to determine whether 4 or 8 bytes needs to be passed. Cache the value in a static. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240904050606.752788-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8c837c8a6c8c..f5eb1af8302c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -8,6 +8,7 @@ #include #include #include "term.h" +#include "env.h" #include "evlist.h" #include "evsel.h" #include @@ -673,7 +674,22 @@ static int add_tracepoint_multi_sys(struct parse_events_state *parse_state, size_t default_breakpoint_len(void) { +#if defined(__i386__) + static int len; + + if (len == 0) { + struct perf_env env = {}; + + perf_env__init(&env); + len = perf_env__kernel_is_64_bit(&env) ? sizeof(u64) : sizeof(long); + perf_env__exit(&env); + } + return len; +#elif defined(__aarch64__) + return 4; +#else return sizeof(long); +#endif } static int -- cgit v1.2.3 From 1a5efc9e13f357abc396dbf445b25d08914c8060 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Wed, 4 Sep 2024 11:48:30 +0530 Subject: libsubcmd: Don't free the usage string Currently, commands which depend on 'parse_options_subcommand()' don't show the usage string, and instead show '(null)': $ ./perf sched Usage: (null) -D, --dump-raw-trace dump raw trace in ASCII -f, --force don't complain, do it -i, --input input file name -v, --verbose be more verbose (show symbol address, etc) 'parse_options_subcommand()' is generally expected to initialise the usage string with information in the passed 'subcommands[]' array. This behaviour was changed in 230a7a71f92212e7 ("libsubcmd: Fix parse-options memory leak"), where the generated usage string is deallocated and the usage[0] string is reassigned as NULL. As discussed in [1], free the allocated usage string in the main function itself, and don't reset the usage string to NULL in parse_options_subcommand(). With this change, the behaviour is restored:
$ ./perf sched Usage: perf sched [] {record|latency|map|replay|script|timehist} -D, --dump-raw-trace dump raw trace in ASCII -f, --force don't complain, do it -i, --input input file name -v, --verbose be more verbose (show symbol address, etc) [1]: https://lore.kernel.org/linux-perf-users/htq5vhx6piet4nuq2mmhk7fs2bhfykv52dbppwxmo3s7du2odf@styd27tioc6e/ Fixes: 230a7a71f92212e7 ("libsubcmd: Fix parse-options memory leak") Suggested-by: Namhyung Kim Signed-off-by: Aditya Gupta Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Athira Rajeev Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904061836.55873-2-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/parse-options.c | 8 +++----- tools/perf/builtin-kmem.c | 2 ++ tools/perf/builtin-kvm.c | 3 +++ tools/perf/builtin-kwork.c | 3 +++ tools/perf/builtin-lock.c | 3 +++ tools/perf/builtin-mem.c | 3 +++ tools/perf/builtin-sched.c | 3 +++ 7 files changed, 20 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index 4b60ec03b0bb..eb896d30545b 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -633,10 +633,11 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o const char *const subcommands[], const char *usagestr[], int flags) { struct parse_opt_ctx_t ctx; - char *buf = NULL; /* build usage string if it's not provided */ if (subcommands && !usagestr[0]) { + char *buf = NULL; + astrcatf(&buf, "%s %s [] {", subcmd_config.exec_name, argv[0]); for (int i = 0; subcommands[i]; i++) { @@ -678,10 +679,7 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o astrcatf(&error_buf, "unknown switch `%c'", *ctx.opt); usage_with_options(usagestr, options); } - if (buf) { - usagestr[0] = NULL; - free(buf); - } + return parse_options_end(&ctx); } diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index b3cbac40b8c7..a756147e2eec 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -2057,6 +2057,8 @@ int cmd_kmem(int argc, const char **argv) out_delete: perf_session__delete(session); + /* free usage string allocated by parse_options_subcommand */ + free((void *)kmem_usage[0]); return ret; } diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 692267b1b7e8..55ea17c5ff02 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -2184,5 +2184,8 @@ int cmd_kvm(int argc, const char **argv) else usage_with_options(kvm_usage, kvm_options); + /* free usage string allocated by parse_options_subcommand */ + free((void *)kvm_usage[0]); + return 0; } diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 6a4281b8fd10..c1daf82c9b92 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -2519,5 +2519,8 @@ int cmd_kwork(int argc, const char **argv) } else usage_with_options(kwork_usage, kwork_options); + /* free usage string allocated by parse_options_subcommand */ + free((void *)kwork_usage[0]); + return 0; } diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 2c216427e929..062e2b56a2ab 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2712,6 +2712,9 @@ int cmd_lock(int argc, const char **argv) usage_with_options(lock_usage, lock_options); } + /* free usage string allocated by parse_options_subcommand */ + free((void 
*)lock_usage[0]); + zfree(&lockhash_table); return rc; } diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 18e5f9a0ddc2..bb38bb5a1c26 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -546,5 +546,8 @@ int cmd_mem(int argc, const char **argv) else usage_with_options(mem_usage, mem_options); + /* free usage string allocated by parse_options_subcommand */ + free((void *)mem_usage[0]); + return 0; } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 0a7b2b2acd56..5981cc51abc8 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3954,5 +3954,8 @@ int cmd_sched(int argc, const char **argv) usage_with_options(sched_usage, sched_options); } + /* free usage string allocated by parse_options_subcommand */ + free((void *)sched_usage[0]); + return 0; } -- cgit v1.2.3 From 98ad0b77323ce8c216e5a505fda6b2ea53299231 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Wed, 4 Sep 2024 11:48:31 +0530 Subject: perf check: Introduce 'check' subcommand Currently the presence of a feature is checked with a combination of perf version --build-options and greps, such as: perf version --build-options | grep " on .* HAVE_FEATURE" Instead of this, introduce a subcommand "perf check feature", with which scripts can test for presence of a feature, such as: perf check feature HAVE_FEATURE The 'perf check feature' command is expected to have an exit status of 0 if the feature is built-in, and 1 if it's not built-in or if the feature is not known. Multiple features can also be passed as a comma-separated list, in which case the exit status will be 0 only if all of the passed features are built-in. For example, the command below will have an exit status of 0 only if both libtraceevent and bpf are enabled, else 1 in all other cases: perf check feature libtraceevent,bpf The arguments are case-insensitive.
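In sketch form (illustrative only, not part of the patch), the comma-list handling reduces to a logical AND over the individual features, negated into a shell exit status; has_support() is the per-feature lookup the patch adds below:

    #include <string.h>

    extern int has_support(const char *feature); /* 1 if built-in, 0 otherwise */

    static int check_feature_list(char *list)
    {
            int all_builtin = 1;
            char *name;

            for (name = strtok(list, ","); name; name = strtok(NULL, ","))
                    all_builtin &= has_support(name);

            return !all_builtin; /* exit status: 0 == all built-in */
    }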
An array 'supported_features' has also been introduced that can be used by other commands like 'perf version --build-options', so that new features can be added in one place, via that array. Committer testing: $ perf check feature libtraceevent,bpf libtraceevent: [ on ] # HAVE_LIBTRACEEVENT bpf: [ on ] # HAVE_LIBBPF_SUPPORT $ perf check feature libtraceevent libtraceevent: [ on ] # HAVE_LIBTRACEEVENT $ perf check feature bpf bpf: [ on ] # HAVE_LIBBPF_SUPPORT $ perf check -q feature bpf && echo "BPF support is present" BPF support is present $ perf check -q feature Bogus && echo "Bogus support is present" $ Reviewed-by: Athira Rajeev Signed-off-by: Aditya Gupta Tested-by: Arnaldo Carvalho de Melo Cc: Athira Rajeev Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904061836.55873-3-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Build | 1 + tools/perf/Documentation/perf-check.txt | 82 +++++++++++++++ tools/perf/builtin-check.c | 180 ++++++++++++++++++++++++++++++++ tools/perf/builtin.h | 17 +++ tools/perf/perf.c | 1 + 5 files changed, 281 insertions(+) create mode 100644 tools/perf/Documentation/perf-check.txt create mode 100644 tools/perf/builtin-check.c (limited to 'tools') diff --git a/tools/perf/Build b/tools/perf/Build index 1d4957957d75..3e486f7df94b 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -1,5 +1,6 @@ perf-bench-y += builtin-bench.o perf-y += builtin-annotate.o +perf-y += builtin-check.o perf-y += builtin-config.o perf-y += builtin-diff.o perf-y += builtin-evlist.o diff --git a/tools/perf/Documentation/perf-check.txt b/tools/perf/Documentation/perf-check.txt new file mode 100644 index 000000000000..b081514d240a --- /dev/null +++ b/tools/perf/Documentation/perf-check.txt @@ -0,0 +1,82 @@ +perf-check(1) +=============== + +NAME +---- +perf-check - check if features are present in perf + +SYNOPSIS +-------- +[verse] +'perf check' [] +'perf check' {feature } [] + +DESCRIPTION +----------- +With no subcommands given, 'perf check' command just prints the command +usage on the standard output. + +If the subcommand 'feature' is used, then status of feature is printed +on the standard output (unless '-q' is also passed), ie. whether it is +compiled-in/built-in or not. +Also, 'perf check feature' returns with exit status 0 if the feature +is built-in, otherwise returns with exit status 1. + +SUBCOMMANDS +----------- + +feature:: + + Print whether feature(s) is compiled-in or not, and also returns with an + exit status of 0, if passed feature(s) are compiled-in, else 1. + + It expects a feature list as an argument. There can be a single feature + name/macro, or multiple features can also be passed as a comma-separated + list, in which case the exit status will be 0 only if all of the passed + features are compiled-in. + + The feature names/macros are case-insensitive.
+ + Example Usage: + perf check feature libtraceevent + perf check feature HAVE_LIBTRACEEVENT + perf check feature libtraceevent,bpf + + Supported feature names/macro: + aio / HAVE_AIO_SUPPORT + bpf / HAVE_LIBBPF_SUPPORT + bpf_skeletons / HAVE_BPF_SKEL + debuginfod / HAVE_DEBUGINFOD_SUPPORT + dwarf / HAVE_DWARF_SUPPORT + dwarf_getlocations / HAVE_DWARF_GETLOCATIONS_SUPPORT + dwarf-unwind-support / HAVE_DWARF_UNWIND_SUPPORT + get_cpuid / HAVE_AUXTRACE_SUPPORT + libaudit / HAVE_LIBAUDIT_SUPPORT + libbfd / HAVE_LIBBFD_SUPPORT + libcapstone / HAVE_LIBCAPSTONE_SUPPORT + libcrypto / HAVE_LIBCRYPTO_SUPPORT + libdw-dwarf-unwind / HAVE_DWARF_SUPPORT + libelf / HAVE_LIBELF_SUPPORT + libnuma / HAVE_LIBNUMA_SUPPORT + libopencsd / HAVE_CSTRACE_SUPPORT + libperl / HAVE_LIBPERL_SUPPORT + libpfm4 / HAVE_LIBPFM + libpython / HAVE_LIBPYTHON_SUPPORT + libslang / HAVE_SLANG_SUPPORT + libtraceevent / HAVE_LIBTRACEEVENT + libunwind / HAVE_LIBUNWIND_SUPPORT + lzma / HAVE_LZMA_SUPPORT + numa_num_possible_cpus / HAVE_LIBNUMA_SUPPORT + syscall_table / HAVE_SYSCALL_TABLE_SUPPORT + zlib / HAVE_ZLIB_SUPPORT + zstd / HAVE_ZSTD_SUPPORT + +OPTIONS +------- +-q:: +--quiet:: + Do not print any messages or warnings + + This can be used along with subcommands such as 'perf check feature' + to hide unnecessary output in test scripts, eg. + 'perf check feature --quiet libtraceevent' diff --git a/tools/perf/builtin-check.c b/tools/perf/builtin-check.c new file mode 100644 index 000000000000..16a04b5456d9 --- /dev/null +++ b/tools/perf/builtin-check.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "builtin.h" +#include "color.h" +#include "util/debug.h" +#include "util/header.h" +#include +#include +#include +#include +#include + +static const char * const check_subcommands[] = { "feature", NULL }; +static struct option check_options[] = { + OPT_BOOLEAN('q', "quiet", &quiet, "do not show any warnings or messages"), + OPT_END() +}; +static struct option check_feature_options[] = { OPT_PARENT(check_options) }; + +static const char *check_usage[] = { NULL, NULL }; +static const char *check_feature_usage[] = { + "perf check feature ", + NULL +}; + +struct feature_status supported_features[] = { + FEATURE_STATUS("aio", HAVE_AIO_SUPPORT), + FEATURE_STATUS("bpf", HAVE_LIBBPF_SUPPORT), + FEATURE_STATUS("bpf_skeletons", HAVE_BPF_SKEL), + FEATURE_STATUS("debuginfod", HAVE_DEBUGINFOD_SUPPORT), + FEATURE_STATUS("dwarf", HAVE_DWARF_SUPPORT), + FEATURE_STATUS("dwarf_getlocations", HAVE_DWARF_GETLOCATIONS_SUPPORT), + FEATURE_STATUS("dwarf-unwind-support", HAVE_DWARF_UNWIND_SUPPORT), + FEATURE_STATUS("get_cpuid", HAVE_AUXTRACE_SUPPORT), + FEATURE_STATUS("libaudit", HAVE_LIBAUDIT_SUPPORT), + FEATURE_STATUS("libbfd", HAVE_LIBBFD_SUPPORT), + FEATURE_STATUS("libcapstone", HAVE_LIBCAPSTONE_SUPPORT), + FEATURE_STATUS("libcrypto", HAVE_LIBCRYPTO_SUPPORT), + FEATURE_STATUS("libdw-dwarf-unwind", HAVE_DWARF_SUPPORT), + FEATURE_STATUS("libelf", HAVE_LIBELF_SUPPORT), + FEATURE_STATUS("libnuma", HAVE_LIBNUMA_SUPPORT), + FEATURE_STATUS("libopencsd", HAVE_CSTRACE_SUPPORT), + FEATURE_STATUS("libperl", HAVE_LIBPERL_SUPPORT), + FEATURE_STATUS("libpfm4", HAVE_LIBPFM), + FEATURE_STATUS("libpython", HAVE_LIBPYTHON_SUPPORT), + FEATURE_STATUS("libslang", HAVE_SLANG_SUPPORT), + FEATURE_STATUS("libtraceevent", HAVE_LIBTRACEEVENT), + FEATURE_STATUS("libunwind", HAVE_LIBUNWIND_SUPPORT), + FEATURE_STATUS("lzma", HAVE_LZMA_SUPPORT), + FEATURE_STATUS("numa_num_possible_cpus", HAVE_LIBNUMA_SUPPORT), + FEATURE_STATUS("syscall_table", 
HAVE_SYSCALL_TABLE_SUPPORT), + FEATURE_STATUS("zlib", HAVE_ZLIB_SUPPORT), + FEATURE_STATUS("zstd", HAVE_ZSTD_SUPPORT), + + /* this should remain at end, to know the array end */ + FEATURE_STATUS(NULL, _) +}; + +static void on_off_print(const char *status) +{ + printf("[ "); + + if (!strcmp(status, "OFF")) + color_fprintf(stdout, PERF_COLOR_RED, "%-3s", status); + else + color_fprintf(stdout, PERF_COLOR_GREEN, "%-3s", status); + + printf(" ]"); +} + +/* Helper function to print status of a feature along with name/macro */ +static void status_print(const char *name, const char *macro, + const char *status) +{ + printf("%22s: ", name); + on_off_print(status); + printf(" # %s\n", macro); +} + +#define STATUS(feature) \ +do { \ + if (feature.is_builtin) \ + status_print(feature.name, feature.macro, "on"); \ + else \ + status_print(feature.name, feature.macro, "OFF"); \ +} while (0) + +/** + * check whether "feature" is built-in with perf + * + * returns: + * 0: NOT built-in or Feature not known + * 1: Built-in + */ +static int has_support(const char *feature) +{ + for (int i = 0; supported_features[i].name; ++i) { + if ((strcasecmp(feature, supported_features[i].name) == 0) || + (strcasecmp(feature, supported_features[i].macro) == 0)) { + if (!quiet) + STATUS(supported_features[i]); + return supported_features[i].is_builtin; + } + } + + if (!quiet) + pr_err("Unknown feature '%s', please use 'perf version --build-options' to see which ones are available.\n", feature); + + return 0; +} + + +/** + * Usage: 'perf check feature ' + * + * can be a single feature name/macro, or a comma-separated list + * of feature names/macros + * eg. argument can be "libtraceevent" or "libtraceevent,bpf" etc + * + * In case of a comma-separated list, feature_enabled will be 1, only if + * all features passed in the string are supported + * + * Note that argv will get modified + */ +static int subcommand_feature(int argc, const char **argv) +{ + char *feature_list; + char *feature_name; + int feature_enabled; + + argc = parse_options(argc, argv, check_feature_options, + check_feature_usage, 0); + + if (!argc) + usage_with_options(check_feature_usage, check_feature_options); + + if (argc > 1) { + pr_err("Too many arguments passed to 'perf check feature'\n"); + return -1; + } + + feature_enabled = 1; + /* feature_list is a non-const copy of 'argv[0]' */ + feature_list = strdup(argv[0]); + if (!feature_list) { + pr_err("ERROR: failed to allocate memory for feature list\n"); + return -1; + } + + feature_name = strtok(feature_list, ","); + + while (feature_name) { + feature_enabled &= has_support(feature_name); + feature_name = strtok(NULL, ","); + } + + free(feature_list); + + return !feature_enabled; +} + +int cmd_check(int argc, const char **argv) +{ + argc = parse_options_subcommand(argc, argv, check_options, + check_subcommands, check_usage, 0); + + if (!argc) + usage_with_options(check_usage, check_options); + + if (strcmp(argv[0], "feature") == 0) + return subcommand_feature(argc, argv); + + /* If no subcommand matched above, print usage help */ + pr_err("Unknown subcommand: %s\n", argv[0]); + usage_with_options(check_usage, check_options); + + /* free usage string allocated by parse_options_subcommand */ + free((void *)check_usage[0]); + + return 0; +} diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index f4375deabfa3..94f4b3769bf7 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -2,6 +2,22 @@ #ifndef BUILTIN_H #define BUILTIN_H +#include +#include +#include + +struct feature_status { + 
const char *name; + const char *macro; + int is_builtin; +}; + +#define FEATURE_STATUS(name_, macro_) { \ + .name = name_, \ + .macro = #macro_, \ + .is_builtin = IS_BUILTIN(macro_) } + +extern struct feature_status supported_features[]; struct cmdnames; void list_common_cmds_help(void); @@ -11,6 +27,7 @@ int cmd_annotate(int argc, const char **argv); int cmd_bench(int argc, const char **argv); int cmd_buildid_cache(int argc, const char **argv); int cmd_buildid_list(int argc, const char **argv); +int cmd_check(int argc, const char **argv); int cmd_config(int argc, const char **argv); int cmd_c2c(int argc, const char **argv); int cmd_diff(int argc, const char **argv); diff --git a/tools/perf/perf.c b/tools/perf/perf.c index bd3f80b5bb46..4def800f4089 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -52,6 +52,7 @@ static struct cmd_struct commands[] = { { "archive", NULL, 0 }, { "buildid-cache", cmd_buildid_cache, 0 }, { "buildid-list", cmd_buildid_list, 0 }, + { "check", cmd_check, 0 }, { "config", cmd_config, 0 }, { "c2c", cmd_c2c, 0 }, { "diff", cmd_diff, 0 }, -- cgit v1.2.3 From 9b2b9b66d532197d08efa1eba157fc484c3657bc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 3 Sep 2024 21:43:51 -0700 Subject: perf jevents: Add cpuid to model lookup command When restricting the jevents-generated JSON lookup code with JEVENTS_MODEL, a list of models must be provided. Some builds don't know model names but know cpuids. Add a command that can convert a cpuid to a model using mapfile.csv files. This can be used with JEVENTS_MODEL like: $ make JEVENTS_MODEL=`./pmu-events/models.py x86 'GenuineIntel-6-8D-1,AuthenticAMD-26-1' pmu-events/arch/` Committer testing: $ tools/perf/pmu-events/models.py x86 'GenuineIntel-6-8D-1,AuthenticAMD-26-1' tools/perf/pmu-events/arch/ tigerlake,amdzen5 $ perf stat -v sleep 1 |& head -1 Using CPUID GenuineIntel-6-B7-1 $ tools/perf/pmu-events/models.py x86 'GenuineIntel-6-B7-1' tools/perf/pmu-events/arch/ alderlake $ Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240904044351.712080-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/models.py | 73 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 tools/perf/pmu-events/models.py (limited to 'tools') diff --git a/tools/perf/pmu-events/models.py b/tools/perf/pmu-events/models.py new file mode 100755 index 000000000000..8f727d29c952 --- /dev/null +++ b/tools/perf/pmu-events/models.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +"""List model names from mapfile.csv files.""" +import argparse +import csv +import os +import re +from typing import List + +def main() -> None: + def dir_path(path: str) -> str: + """Validate path is a directory for argparse.""" + if os.path.isdir(path): + return path + raise argparse.ArgumentTypeError(f'\'{path}\' is not a valid directory') + + def find_archs(start_dir: str, arch: str) -> List[str]: + archs = [] + for item in os.scandir(start_dir): + if not item.is_dir(): + continue + if arch in (item.name, 'all'): + archs.append(item.name) + + if len(archs) < 1: + raise IOError(f'Missing architecture directory \'{arch}\'') + + return archs + + def find_mapfiles(start_dir: str, archs: List[str]) -> List[str]: + result = [] + for arch in archs: + for item in
os.scandir(f'{start_dir}/{arch}'): + if item.is_dir(): + continue + if item.name == 'mapfile.csv': + result.append(f'{start_dir}/{arch}/mapfile.csv') + return result + + def find_cpuids(mapfiles: List[str], cpuids: str) -> List[str]: + result = [] + for mapfile in mapfiles: + with open(mapfile, encoding='utf-8') as csvfile: + first = False + table = csv.reader(csvfile) + for row in table: + if not first or len(row) == 0 or row[0].startswith('#'): + first = True + continue + # Python regular expressions don't handle xdigit. + regex = row[0].replace('[[:xdigit:]]', '[0-9a-fA-F]') + for cpuid in cpuids.split(','): + if re.match(regex, cpuid): + result.append(row[2]) + return result + + ap = argparse.ArgumentParser() + ap.add_argument('arch', help='Architecture name like x86') + ap.add_argument('cpuid', default='all', help='List of cpuids to convert to model names') + ap.add_argument( + 'starting_dir', + type=dir_path, + help='Root of tree containing architecture directories containing json files' + ) + args = ap.parse_args() + + archs = find_archs(args.starting_dir, args.arch) + mapfiles = find_mapfiles(args.starting_dir, archs) + models = find_cpuids(mapfiles, args.cpuid) + print(','.join(models)) + +if __name__ == '__main__': + main() -- cgit v1.2.3 From dce35dd27684640508ff330b8235c4671159743e Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Wed, 4 Sep 2024 15:35:50 +0800 Subject: spi: spidev_fdx: Fix the wrong format specifier The unsigned int should use "%u" instead of "%d". Signed-off-by: zhang jiao Link: https://patch.msgid.link/20240904073550.103618-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Mark Brown --- tools/spi/spidev_fdx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/spi/spidev_fdx.c b/tools/spi/spidev_fdx.c index 7d2a867cd4ae..bc9c4f6c3ba8 100644 --- a/tools/spi/spidev_fdx.c +++ b/tools/spi/spidev_fdx.c @@ -99,7 +99,7 @@ static void dumpstat(const char *name, int fd) return; } - printf("%s: spi mode 0x%x, %d bits %sper word, %d Hz max\n", + printf("%s: spi mode 0x%x, %d bits %sper word, %u Hz max\n", name, mode, bits, lsb ? "(lsb first) " : "", speed); } -- cgit v1.2.3 From 02baa0a2a677cf543899bc3eb43ed92caf4aba7a Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Tue, 3 Sep 2024 09:28:39 +0800 Subject: selftests/bpf: Fix procmap_query()'s params mismatch and compilation warning When PROCMAP_QUERY is not defined, a compilation error occurs due to the mismatch of procmap_query()'s params; since procmap_query() is only called in the file where the function is defined, modify the params so they match. We also get a warning when building samples/bpf: trace_helpers.c:252:5: warning: no previous prototype for ‘procmap_query’ [-Wmissing-prototypes] 252 | int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) | ^~~~~~~~~~~~~ As this function is only used in the file, mark it as 'static'.
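(Generic illustration, not from this patch: -Wmissing-prototypes fires on any external function definition that has no prior declaration, and giving the function internal linkage removes that requirement.)

    /* helper.c, compiled with -Wmissing-prototypes */
    static int quiet_helper(void) /* internal linkage: no prototype needed */
    {
            return 41;
    }

    int helper(void) /* warning: no previous prototype for 'helper' */
    {
            return quiet_helper() + 1;
    }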
Fixes: 4e9e07603ecd ("selftests/bpf: make use of PROCMAP_QUERY ioctl if available") Signed-off-by: Yuan Chen Link: https://lore.kernel.org/r/20240903012839.3178-1-chenyuan_fl@163.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/trace_helpers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 1bfd881c0e07..2d742fdac6b9 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -249,7 +249,7 @@ out: #ifdef PROCMAP_QUERY int env_verbosity __weak = 0; -int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) +static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { char path_buf[PATH_MAX], build_id_buf[20]; struct procmap_query q; @@ -293,7 +293,7 @@ int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, si return 0; } #else -int procmap_query(int fd, const void *addr, size_t *start, size_t *offset, int *flags) +static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { return -EOPNOTSUPP; } -- cgit v1.2.3 From d441734d0cfcebc5780bc1880d871f92debb588a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Aug 2024 17:20:28 -0400 Subject: ktest.pl: Always warn on build warnings If a warning happens at build, give a warning at the end: Build time: 1 minute 40 seconds Install time: 17 seconds Reboot time: 25 seconds *** WARNING found in build: 1 *** ******************************************* ******************************************* KTEST RESULT: TEST 1 SUCCESS!!!! ** ******************************************* ******************************************* This way, even if the test isn't made to fail on warnings during the build, a message is still displayed that warnings were found. Link: https://lore.kernel.org/<20240819172028.3a7fae09@gandalf.local.home> Acked-by: John 'Warthog9' Hawley (Tenstorrent) Signed-off-by: Steven Rostedt --- tools/testing/ktest/ktest.pl | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index eb31cd9c977b..c82b8d55dddb 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -222,6 +222,8 @@ my $install_time; my $reboot_time; my $test_time; +my $warning_found = 0; + my $pwd; my $dirname = $FindBin::Bin; @@ -729,11 +731,18 @@ sub print_times { show_time($test_time); doprint "\n"; } + if ($warning_found) { + doprint "\n*** WARNING"; + doprint "S" if ($warning_found > 1); + doprint " found in build: $warning_found ***\n\n"; + } + # reset for iterations like bisect $build_time = 0; $install_time = 0; $reboot_time = 0; $test_time = 0; + $warning_found = 0; } sub get_mandatory_configs { @@ -2460,8 +2469,6 @@ sub process_warning_line { # Returns 1 if OK # 0 otherwise sub check_buildlog { - return 1 if (!defined $warnings_file); - my %warnings_list; # Failed builds should not reboot the target @@ -2482,18 +2489,21 @@ sub check_buildlog { close(IN); } - # If warnings file didn't exist, and WARNINGS_FILE exist, - # then we fail on any warning! 
- open(IN, $buildlog) or dodie "Can't open $buildlog"; while () { if (/$check_build_re/) { my $warning = process_warning_line $_; if (!defined $warnings_list{$warning}) { - fail "New warning found (not in $warnings_file)\n$_\n"; - $no_reboot = $save_no_reboot; - return 0; + $warning_found++; + + # If warnings file didn't exist, and WARNINGS_FILE exist, + # then we fail on any warning! + if (defined $warnings_file) { + fail "New warning found (not in $warnings_file)\n$_\n"; + $no_reboot = $save_no_reboot; + return 0; + } } } } -- cgit v1.2.3 From 2351e8c65404aabc433300b6bf90c7a37e8bbc4d Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 4 Sep 2024 13:55:30 -0400 Subject: ktest.pl: Avoid false positives with grub2 skip regex Some distros have grub2 config files with the lines if [ x"${feature_menuentry_id}" = xy ]; then menuentry_id_option="--id" else menuentry_id_option="" fi which match the skip regex defined for grub2 in get_grub_index(): $skip = '^\s*menuentry'; These false positives cause the grub number to be higher than it should be, and the wrong kernel can end up booting. Grub documents the menuentry command with whitespace between it and the title, so make the skip regex reflect this. Link: https://lore.kernel.org/20240904175530.84175-1-daniel.m.jordan@oracle.com Signed-off-by: Daniel Jordan Acked-by: John 'Warthog9' Hawley (Tenstorrent) Signed-off-by: Steven Rostedt --- tools/testing/ktest/ktest.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index c82b8d55dddb..dacad94e2be4 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -2056,7 +2056,7 @@ sub get_grub_index { } elsif ($reboot_type eq "grub2") { $command = "cat $grub_file"; $target = '^\s*menuentry.*' . $grub_menu_qt; - $skip = '^\s*menuentry'; + $skip = '^\s*menuentry\s'; $submenu = '^\s*submenu\s'; } elsif ($reboot_type eq "grub2bls") { $command = $grub_bls_get; -- cgit v1.2.3 From 6cdd7750de408b08c9f77ac23723d48598fbeca7 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Thu, 5 Sep 2024 00:31:28 +0530 Subject: perf version: Update --build-options to use 'supported_features' array Now that the feature list has been duplicated in a global 'supported_features' array, use that array instead of manually checking status of built-in features. 
This helps in being consistent with commands such as 'perf check feature', so commands can use the same array, and any new feature can be added in one place, in the 'supported_features' array. Reviewed-by: Athira Rajeev Signed-off-by: Aditya Gupta Acked-by: Namhyung Kim Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904190132.415212-4-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-version.c | 44 ++++++++------------------------------------ 1 file changed, 8 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c index 4b252196de12..e149d96c6dc5 100644 --- a/tools/perf/builtin-version.c +++ b/tools/perf/builtin-version.c @@ -46,46 +46,18 @@ static void status_print(const char *name, const char *macro, printf(" # %s\n", macro); } -#define STATUS(__d, __m) \ -do { \ - if (IS_BUILTIN(__d)) \ - status_print(#__m, #__d, "on"); \ - else \ - status_print(#__m, #__d, "OFF"); \ +#define STATUS(feature) \ +do { \ + if (feature.is_builtin) \ + status_print(feature.name, feature.macro, "on"); \ + else \ + status_print(feature.name, feature.macro, "OFF"); \ } while (0) static void library_status(void) { - STATUS(HAVE_DWARF_SUPPORT, dwarf); - STATUS(HAVE_DWARF_GETLOCATIONS_SUPPORT, dwarf_getlocations); -#ifndef HAVE_SYSCALL_TABLE_SUPPORT - STATUS(HAVE_LIBAUDIT_SUPPORT, libaudit); -#endif - STATUS(HAVE_SYSCALL_TABLE_SUPPORT, syscall_table); - STATUS(HAVE_LIBBFD_SUPPORT, libbfd); - STATUS(HAVE_DEBUGINFOD_SUPPORT, debuginfod); - STATUS(HAVE_LIBELF_SUPPORT, libelf); - STATUS(HAVE_LIBLLVM_SUPPORT, libllvm); - STATUS(HAVE_LIBNUMA_SUPPORT, libnuma); - STATUS(HAVE_LIBNUMA_SUPPORT, numa_num_possible_cpus); - STATUS(HAVE_LIBPERL_SUPPORT, libperl); - STATUS(HAVE_LIBPYTHON_SUPPORT, libpython); - STATUS(HAVE_SLANG_SUPPORT, libslang); - STATUS(HAVE_LIBCRYPTO_SUPPORT, libcrypto); - STATUS(HAVE_LIBUNWIND_SUPPORT, libunwind); - STATUS(HAVE_DWARF_SUPPORT, libdw-dwarf-unwind); - STATUS(HAVE_LIBCAPSTONE_SUPPORT, libcapstone); - STATUS(HAVE_ZLIB_SUPPORT, zlib); - STATUS(HAVE_LZMA_SUPPORT, lzma); - STATUS(HAVE_AUXTRACE_SUPPORT, get_cpuid); - STATUS(HAVE_LIBBPF_SUPPORT, bpf); - STATUS(HAVE_AIO_SUPPORT, aio); - STATUS(HAVE_ZSTD_SUPPORT, zstd); - STATUS(HAVE_LIBPFM, libpfm4); - STATUS(HAVE_LIBTRACEEVENT, libtraceevent); - STATUS(HAVE_BPF_SKEL, bpf_skeletons); - STATUS(HAVE_DWARF_UNWIND_SUPPORT, dwarf-unwind-support); - STATUS(HAVE_CSTRACE_SUPPORT, libopencsd); + for (int i = 0; supported_features[i].name; ++i) + STATUS(supported_features[i]); } int cmd_version(int argc, const char **argv) -- cgit v1.2.3 From 8a028502b4124eb9e37b5216e440ebb9e187b0a1 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Thu, 5 Sep 2024 00:31:29 +0530 Subject: perf tools test_task_analyzer.sh: Update to use 'perf check feature' Currently we use the output of 'perf version --build-options' to check whether perf was built with libtraceevent support. Instead, use 'perf check feature libtraceevent' to check for libtraceevent support.
Reviewed-by: Athira Rajeev Signed-off-by: Aditya Gupta Acked-by: Namhyung Kim Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904190132.415212-5-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_task_analyzer.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index cb02bf23e6a5..7d76fc63d995 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -55,8 +55,8 @@ find_str_or_fail() { # check if perf is compiled with libtraceevent support skip_no_probe_record_support() { - perf version --build-options | grep -q " OFF .* HAVE_LIBTRACEEVENT" && return 2 - return 0 + perf check feature -q libtraceevent && return 0 + return 2 } prepare_perf_data() { -- cgit v1.2.3 From 512fcf7d9d7fb3751b03db45977b7fabaadfaecd Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 5 Sep 2024 00:31:30 +0530 Subject: perf tests probe_vfs_getname.sh: Update to use 'perf check feature' In probe_vfs_getname.sh, currently we use "perf record --dry-run" to check for libtraceevent and skip the test if perf is not built with libtraceevent. Change the check to use the "perf check feature" option. Signed-off-by: Athira Rajeev Acked-by: Namhyung Kim Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904190132.415212-6-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/probe_vfs_getname.sh | 4 ++-- tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 5 ++++- tools/perf/tests/shell/record+script_probe_vfs_getname.sh | 5 ++++- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/lib/probe_vfs_getname.sh b/tools/perf/tests/shell/lib/probe_vfs_getname.sh index b3802d9494b4..0606e693eb59 100644 --- a/tools/perf/tests/shell/lib/probe_vfs_getname.sh +++ b/tools/perf/tests/shell/lib/probe_vfs_getname.sh @@ -32,7 +32,7 @@ skip_if_no_debuginfo() { # check if perf is compiled with libtraceevent support skip_no_probe_record_support() { if [ $had_vfs_getname -eq 1 ] ; then - perf record --dry-run -e $1 2>&1 | grep "libtraceevent is necessary for tracepoint support" && return 2 - return 1 + perf check feature -q libtraceevent && return 1 + return 2 fi } diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh index 72c65570db37..f38c8ead0b03 100755 --- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh @@ -63,7 +63,10 @@ trace_libc_inet_pton_backtrace() { # Check presence of libtraceevent support to run perf record skip_no_probe_record_support "$event_name/$eventattr/" - [ $? -eq 2 ] && return 2 + if [ $? -eq 2 ]; then + echo "WARN: Skipping test trace_libc_inet_pton_backtrace. No libtraceevent support." + return 2 + fi perf record -e $event_name/$eventattr/ -o $perf_data ping -6 -c 1 ::1 > /dev/null 2>&1 # check if perf data file got created in above step.
diff --git a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh index 5eedbe29bba1..9a61928e3c9a 100755 --- a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh +++ b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh @@ -21,7 +21,10 @@ record_open_file() { echo "Recording open file:" # Check presence of libtraceevent support to run perf record skip_no_probe_record_support "probe:vfs_getname*" - [ $? -eq 2 ] && return 2 + if [ $? -eq 2 ]; then + echo "WARN: Skipping test record_open_file. No libtraceevent support" + return 2 + fi perf record -o ${perfdata} -e probe:vfs_getname\* touch $file } -- cgit v1.2.3 From 35439fe4e29bca5fe8da8ed3f8524ef98f3e7aae Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Thu, 5 Sep 2024 00:31:31 +0530 Subject: perf check: Fix inconsistencies in feature names Fix two inconsistencies in feature names as discussed in [1]: 1. Rename "dwarf-unwind-support" to "dwarf-unwind" 2. The 'get_cpuid' feature and 'HAVE_AUXTRACE_SUPPORT' names don't look related; change the feature name to 'auxtrace' to match the macro name, as the 'get_cpuid' string is not used anywhere to check for the feature's presence [1]: https://lore.kernel.org/linux-perf-users/ZoRw5we4HLSTZND6@x1/ Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Aditya Gupta Cc: Athira Rajeev Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904190132.415212-7-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-check.txt | 4 ++-- tools/perf/builtin-check.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-check.txt b/tools/perf/Documentation/perf-check.txt index b081514d240a..10f69fb6850b 100644 --- a/tools/perf/Documentation/perf-check.txt +++ b/tools/perf/Documentation/perf-check.txt @@ -49,8 +49,8 @@ feature:: debuginfod / HAVE_DEBUGINFOD_SUPPORT dwarf / HAVE_DWARF_SUPPORT dwarf_getlocations / HAVE_DWARF_GETLOCATIONS_SUPPORT - dwarf-unwind-support / HAVE_DWARF_UNWIND_SUPPORT - get_cpuid / HAVE_AUXTRACE_SUPPORT + dwarf-unwind / HAVE_DWARF_UNWIND_SUPPORT + auxtrace / HAVE_AUXTRACE_SUPPORT libaudit / HAVE_LIBAUDIT_SUPPORT libbfd / HAVE_LIBBFD_SUPPORT libcapstone / HAVE_LIBCAPSTONE_SUPPORT diff --git a/tools/perf/builtin-check.c b/tools/perf/builtin-check.c index 16a04b5456d9..0b76b6e42b78 100644 --- a/tools/perf/builtin-check.c +++ b/tools/perf/builtin-check.c @@ -29,8 +29,8 @@ struct feature_status supported_features[] = { FEATURE_STATUS("debuginfod", HAVE_DEBUGINFOD_SUPPORT), FEATURE_STATUS("dwarf", HAVE_DWARF_SUPPORT), FEATURE_STATUS("dwarf_getlocations", HAVE_DWARF_GETLOCATIONS_SUPPORT), - FEATURE_STATUS("dwarf-unwind-support", HAVE_DWARF_UNWIND_SUPPORT), - FEATURE_STATUS("get_cpuid", HAVE_AUXTRACE_SUPPORT), + FEATURE_STATUS("dwarf-unwind", HAVE_DWARF_UNWIND_SUPPORT), + FEATURE_STATUS("auxtrace", HAVE_AUXTRACE_SUPPORT), FEATURE_STATUS("libaudit", HAVE_LIBAUDIT_SUPPORT), FEATURE_STATUS("libbfd", HAVE_LIBBFD_SUPPORT), FEATURE_STATUS("libcapstone", HAVE_LIBCAPSTONE_SUPPORT), -- cgit v1.2.3 From 23457b37ec3f9bb373d43cca61db371303726a1e Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Tue, 3 Sep 2024 15:25:59 +0800 Subject: selftests: bpf: Replace sizeof(arr)/sizeof(arr[0]) with ARRAY_SIZE The ARRAY_SIZE macro is more compact and more idiomatic in the Linux source tree.
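For reference, the selftests' bpf_util.h header provides the macro roughly as follows (simplified sketch; the kernel's own variant additionally rejects pointer arguments via a must-be-array check):

    #ifndef ARRAY_SIZE
    #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
    #endif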
Signed-off-by: Feng Yang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240903072559.292607-1-yangfeng59949@163.com --- tools/testing/selftests/bpf/prog_tests/fexit_stress.c | 3 ++- tools/testing/selftests/bpf/prog_tests/log_buf.c | 5 +++-- tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c | 3 ++- .../selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c | 3 ++- tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c | 5 +++-- tools/testing/selftests/bpf/prog_tests/tc_opts.c | 2 +- tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c | 3 ++- tools/testing/selftests/bpf/progs/syscall.c | 3 ++- tools/testing/selftests/bpf/progs/test_rdonly_maps.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_bits_iter.c | 2 +- 10 files changed, 20 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c index 49b1ffc9af1f..14c91b6f1e83 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include +#include "bpf_util.h" void serial_test_fexit_stress(void) { @@ -36,7 +37,7 @@ void serial_test_fexit_stress(void) for (i = 0; i < bpf_max_tramp_links; i++) { fexit_fd[i] = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", trace_program, - sizeof(trace_program) / sizeof(struct bpf_insn), + ARRAY_SIZE(trace_program), &trace_opts); if (!ASSERT_GE(fexit_fd[i], 0, "fexit load")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c index 0f7ea4d7d9f6..46611040dec3 100644 --- a/tools/testing/selftests/bpf/prog_tests/log_buf.c +++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c @@ -5,6 +5,7 @@ #include #include "test_log_buf.skel.h" +#include "bpf_util.h" static size_t libbpf_log_pos; static char libbpf_log_buf[1024 * 1024]; @@ -143,11 +144,11 @@ static void bpf_prog_load_log_buf(void) BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }; - const size_t good_prog_insn_cnt = sizeof(good_prog_insns) / sizeof(struct bpf_insn); + const size_t good_prog_insn_cnt = ARRAY_SIZE(good_prog_insns); const struct bpf_insn bad_prog_insns[] = { BPF_EXIT_INSN(), }; - size_t bad_prog_insn_cnt = sizeof(bad_prog_insns) / sizeof(struct bpf_insn); + size_t bad_prog_insn_cnt = ARRAY_SIZE(bad_prog_insns); LIBBPF_OPTS(bpf_prog_load_opts, opts); const size_t log_buf_sz = 1024 * 1024; char *log_buf; diff --git a/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c b/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c index aa9f67eb1c95..bea05f78de5f 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c +++ b/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c @@ -4,6 +4,7 @@ #include #include "bpf/libbpf_internal.h" #include "cgroup_helpers.h" +#include "bpf_util.h" static const char *module_name = "bpf_testmod"; static const char *symbol_name = "bpf_fentry_shadow_test"; @@ -100,7 +101,7 @@ void test_module_fentry_shadow(void) load_opts.attach_btf_obj_fd = btf_fd[i]; prog_fd[i] = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", trace_program, - sizeof(trace_program) / sizeof(struct bpf_insn), + ARRAY_SIZE(trace_program), &load_opts); if (!ASSERT_GE(prog_fd[i], 0, "bpf_prog_load")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c 
b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c index e2f1445b0e10..216b0dfac0fe 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c @@ -2,6 +2,7 @@ #include #include +#include "bpf_util.h" void test_raw_tp_writable_reject_nbd_invalid(void) { @@ -25,7 +26,7 @@ void test_raw_tp_writable_reject_nbd_invalid(void) ); bpf_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, NULL, "GPL v2", - program, sizeof(program) / sizeof(struct bpf_insn), + program, ARRAY_SIZE(program), &opts); if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable load", "failed: %d errno %d\n", bpf_fd, errno)) diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c index f4aa7dab4766..e3668058b7bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c @@ -2,6 +2,7 @@ #include #include +#include "bpf_util.h" /* NOTE: conflict with other tests. */ void serial_test_raw_tp_writable_test_run(void) @@ -24,7 +25,7 @@ void serial_test_raw_tp_writable_test_run(void) ); int bpf_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, NULL, "GPL v2", - trace_program, sizeof(trace_program) / sizeof(struct bpf_insn), + trace_program, ARRAY_SIZE(trace_program), &trace_opts); if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded", "failed: %d errno %d\n", bpf_fd, errno)) @@ -41,7 +42,7 @@ void serial_test_raw_tp_writable_test_run(void) ); int filter_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL v2", - skb_program, sizeof(skb_program) / sizeof(struct bpf_insn), + skb_program, ARRAY_SIZE(skb_program), &skb_opts); if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n", filter_fd, errno)) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_opts.c b/tools/testing/selftests/bpf/prog_tests/tc_opts.c index 196abf223465..f77f604389aa 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_opts.c @@ -2384,7 +2384,7 @@ static int generate_dummy_prog(void) BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }; - const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn); + const size_t prog_insn_cnt = ARRAY_SIZE(prog_insns); LIBBPF_OPTS(bpf_prog_load_opts, opts); const size_t log_buf_sz = 256; char log_buf[log_buf_sz]; diff --git a/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c b/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c index 0adf8d9475cb..472f4f9fa95f 100644 --- a/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c +++ b/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c @@ -7,6 +7,7 @@ #include "test_unpriv_bpf_disabled.skel.h" #include "cap_helpers.h" +#include "bpf_util.h" /* Using CAP_LAST_CAP is risky here, since it can get pulled in from * an old /usr/include/linux/capability.h and be < CAP_BPF; as a result @@ -146,7 +147,7 @@ static void test_unpriv_bpf_disabled_negative(struct test_unpriv_bpf_disabled *s BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }; - const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn); + const size_t prog_insn_cnt = ARRAY_SIZE(prog_insns); LIBBPF_OPTS(bpf_prog_load_opts, load_opts); struct bpf_map_info map_info = {}; __u32 map_info_len = sizeof(map_info); diff --git a/tools/testing/selftests/bpf/progs/syscall.c 
b/tools/testing/selftests/bpf/progs/syscall.c index 3d3cafdebe72..0f4dfb770c32 100644 --- a/tools/testing/selftests/bpf/progs/syscall.c +++ b/tools/testing/selftests/bpf/progs/syscall.c @@ -8,6 +8,7 @@ #include #include #include +#include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -119,7 +120,7 @@ int load_prog(struct args *ctx) static __u64 value = 34; static union bpf_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, - .insn_cnt = sizeof(insns) / sizeof(insns[0]), + .insn_cnt = ARRAY_SIZE(insns), }; int ret; diff --git a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c index fc8e8a34a3db..7035fb4d4165 100644 --- a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c +++ b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c @@ -4,6 +4,7 @@ #include #include #include +#include "bpf_misc.h" const struct { unsigned a[4]; @@ -64,7 +65,7 @@ int full_loop(struct pt_regs *ctx) { /* prevent compiler to optimize everything out */ unsigned * volatile p = (void *)&rdonly_values.a; - int i = sizeof(rdonly_values.a) / sizeof(rdonly_values.a[0]); + int i = ARRAY_SIZE(rdonly_values.a); unsigned iters = 0, sum = 0; /* validate verifier can allow full loop as well */ diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c index 716113c2bce2..f4da4d508ddb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c +++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c @@ -87,7 +87,7 @@ int bits_memalloc(void) int *bit; __builtin_memset(&data, 0xf0, sizeof(data)); /* 4 * 16 */ - bpf_for_each(bits, bit, &data[0], sizeof(data) / sizeof(u64)) + bpf_for_each(bits, bit, &data[0], ARRAY_SIZE(data)) nr++; return nr; } -- cgit v1.2.3 From 8195136669661fdfe54e9a8923c33b31c92fc1da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Sep 2024 10:24:59 -1000 Subject: sched_ext: Add cgroup support Add sched_ext_ops operations to init/exit cgroups, and track task migrations and config changes. A BPF scheduler may implement only a subset of the cgroup features, or none at all. The implemented features can be indicated using %SCX_OPS_HAS_CGROUP_* flags. If the cgroup configuration makes use of features that are not implemented, a warning is triggered. While a BPF scheduler is being enabled and disabled, relevant cgroup operations are locked out using scx_cgroup_rwsem. This avoids situations like task prep taking place while the task is being moved across cgroups, making things easier for BPF schedulers. v7: - cgroup interface file visibility toggling is dropped in favor of just warning messages. Dynamically changing interface visibility caused more confusion than it helped. v6: - Updated to reflect the removal of SCX_KF_SLEEPABLE. - Updated to use CONFIG_GROUP_SCHED_WEIGHT and fixes for !CONFIG_FAIR_GROUP_SCHED && CONFIG_EXT_GROUP_SCHED. v5: - Flipped the locking order between scx_cgroup_rwsem and cpus_read_lock() to avoid a locking order conflict w/ cpuset. Better documentation around locking. - sched_move_task() takes an early exit if the source and destination are identical. This triggered the warning in scx_cgroup_can_attach() as it left p->scx.cgrp_moving_from uncleared. Updated the cgroup migration path so that ops.cgroup_prep_move() is skipped for identity migrations so that its invocations always match ops.cgroup_move() one-to-one. v4: - Example schedulers moved into their own patches. - Fix build failure when !CONFIG_CGROUP_SCHED, reported by Andrea Righi.
v3: - Make scx_example_pair switch all tasks by default. - Convert to BPF inline iterators. - scx_bpf_task_cgroup() is added to determine the current cgroup from CPU controller's POV. This allows BPF schedulers to accurately track CPU cgroup membership. - scx_example_flatcg added. This demonstrates flattened hierarchy implementation of CPU cgroup control and shows significant performance improvement when cgroups which are nested multiple levels are under competition. v2: - Build fixes for different CONFIG combinations. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Reported-by: kernel test robot Cc: Andrea Righi --- include/linux/sched/ext.h | 3 + init/Kconfig | 6 + kernel/sched/core.c | 67 ++- kernel/sched/ext.c | 519 +++++++++++++++++++++++- kernel/sched/ext.h | 22 + kernel/sched/sched.h | 5 + tools/sched_ext/include/scx/common.bpf.h | 1 + tools/testing/selftests/sched_ext/maximal.bpf.c | 32 ++ 8 files changed, 636 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index db2a266113ac..dc646cca4fcf 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -188,6 +188,9 @@ struct sched_ext_entity { bool disallow; /* reject switching into SCX */ /* cold fields */ +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif /* must be the last field, see init_scx_entity() */ struct list_head tasks_node; }; diff --git a/init/Kconfig b/init/Kconfig index 84332d3594d0..8c11ed61ca67 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1055,6 +1055,12 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. +config EXT_GROUP_SCHED + bool + depends on SCHED_CLASS_EXT && CGROUP_SCHED + select GROUP_SCHED_WEIGHT + default y + endif #CGROUP_SCHED config SCHED_MM_CID diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d302115b1522..b01b63e78d5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8364,6 +8364,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -8801,6 +8804,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -8928,6 +8932,7 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); + scx_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -8965,6 +8970,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -8979,6 +8989,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ 
-8996,9 +9013,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; @@ -9006,9 +9023,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; -} #endif + return scx_cgroup_can_attach(tset); +} static void cpu_cgroup_attach(struct cgroup_taskset *tset) { @@ -9017,6 +9034,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + scx_cgroup_finish_attach(); +} + +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -9196,15 +9220,25 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) #ifdef CONFIG_GROUP_SCHED_WEIGHT static unsigned long tg_weight(struct task_group *tg) { +#ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif } static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, @@ -9595,7 +9629,12 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, static int cpu_idle_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 idle) { - return sched_group_set_idle(css_tg(css), idle); + int ret; + + ret = sched_group_set_idle(css_tg(css), idle); + if (!ret) + scx_group_set_idle(css_tg(css), idle); + return ret; } #endif @@ -9722,13 +9761,17 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 cgrp_weight) { unsigned long weight; + int ret; if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, @@ -9753,7 +9796,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -9762,7 +9805,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } #endif /* CONFIG_GROUP_SCHED_WEIGHT */ @@ -9878,14 +9925,14 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = 
cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, -#endif .attach = cpu_cgroup_attach, + .cancel_attach = cpu_cgroup_cancel_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, .early_init = true, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5fd2bfc01403..bba9d805dc2b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -116,10 +116,16 @@ enum scx_ops_flags { */ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + /* + * CPU cgroup support flags + */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | - SCX_OPS_SWITCH_PARTIAL, + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_HAS_CGROUP_WEIGHT, }; /* argument container for ops.init_task() */ @@ -129,6 +135,10 @@ struct scx_init_task_args { * to the scheduler transition path. */ bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif }; /* argument container for ops.exit_task() */ @@ -137,6 +147,12 @@ struct scx_exit_task_args { bool cancelled; }; +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; +}; + enum scx_cpu_preempt_reason { /* next task is being scheduled by &sched_class_rt */ SCX_CPU_PREEMPT_RT, @@ -501,6 +517,79 @@ struct sched_ext_ops { */ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * cgroup_init - Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * cgroup_exit - Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation may block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * cgroup_prep_move - Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_move - Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_cancel_move - Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation.
+ */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_set_weight - A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @cgrp's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +#endif /* CONFIG_EXT_GROUP_SCHED */ + /* * All online ops must come before ops.cpu_online(). */ @@ -683,6 +772,11 @@ enum scx_kick_flags { SCX_KICK_WAIT = 1LLU << 2, }; +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + enum scx_ops_enable_state { SCX_OPS_PREPPING, SCX_OPS_ENABLING, @@ -3235,6 +3329,28 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) resched_curr(rq); } +#ifdef CONFIG_EXT_GROUP_SCHED +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. + */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), + +#else /* CONFIG_EXT_GROUP_SCHED */ + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) + +#endif /* CONFIG_EXT_GROUP_SCHED */ + static enum scx_task_state scx_get_task_state(const struct task_struct *p) { return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; @@ -3279,6 +3395,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool if (SCX_HAS_OP(init_task)) { struct scx_init_task_args args = { + SCX_INIT_TASK_ARGS_CGROUP(tg) .fork = fork, }; @@ -3343,7 +3460,7 @@ static void scx_ops_enable_task(struct task_struct *p) scx_set_task_state(p, SCX_TASK_ENABLED); if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); } static void scx_ops_disable_task(struct task_struct *p) @@ -3555,6 +3672,219 @@ bool scx_can_stop_tick(struct rq *rq) } #endif +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool cgroup_warned_missing_weight; +static bool cgroup_warned_missing_idle; + +static void scx_cgroup_warn_missing_weight(struct task_group *tg) +{ + if (scx_ops_enable_state() == SCX_OPS_DISABLED || + cgroup_warned_missing_weight) + return; + + if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) + return; + + pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", + scx_ops.name); + cgroup_warned_missing_weight = true; +} + +static void scx_cgroup_warn_missing_idle(struct task_group *tg) +{ + if (scx_ops_enable_state() == SCX_OPS_DISABLED || + cgroup_warned_missing_idle) + return; + + if (!tg->idle) + return; + + pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", + scx_ops.name); + cgroup_warned_missing_idle = true; +} + +int scx_tg_online(struct task_group *tg) +{ + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + scx_cgroup_warn_missing_weight(tg); + + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + tg->css.cgroup, &args); + if (!ret) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + else + ret = ops_sanitize_err("cgroup_init", ret); + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} +
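/*
 * Editorial sketch (not part of this patch): what the BPF-side counterpart
 * of the above might look like. A scheduler that opts into cpu.weight
 * handling would set SCX_OPS_HAS_CGROUP_WEIGHT in ops.flags (suppressing
 * the missing-weight warning above) and could stash the weight passed to
 * ops.cgroup_init()/ops.cgroup_set_weight() in cgroup storage. The map,
 * struct, and op names here are hypothetical; scx_flatcg later in this
 * series shows a complete implementation.
 */
struct cgrp_w {
	u32 weight;
};

struct {
	__uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct cgrp_w);
} cgrp_w_map SEC(".maps");

s32 BPF_STRUCT_OPS(example_cgroup_init, struct cgroup *cgrp,
		   struct scx_cgroup_init_args *args)
{
	struct cgrp_w *cw;

	cw = bpf_cgrp_storage_get(&cgrp_w_map, cgrp, 0,
				  BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!cw)
		return -ENOMEM;	/* aborts scheduler load or cgroup creation */

	cw->weight = args->weight;	/* [1..10000], default 100 */
	return 0;
}

void BPF_STRUCT_OPS(example_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	struct cgrp_w *cw;

	cw = bpf_cgrp_storage_get(&cgrp_w_map, cgrp, 0, 0);
	if (cw)
		cw->weight = weight;
}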
+void scx_tg_offline(struct task_group *tg) +{ + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_enabled()) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(task_group(p)); + struct cgroup *to = tg_cgrp(css_tg(css)); + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + + /* + * sched_move_task() omits identity migrations. Let's match the + * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() + * always match one-to-one. + */ + if (from == to) + continue; + + if (SCX_HAS_OP(cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, + p, from, css->cgroup); + if (ret) + goto err; + } + + p->scx.cgrp_moving_from = from; + } + + return 0; + +err: + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ops_sanitize_err("cgroup_prep_move", ret); +} + +void scx_move_task(struct task_struct *p) +{ + if (!scx_enabled()) + return; + + /* + * We're called from sched_move_task() which handles both cgroup and + * autogroup moves. Ignore the latter. + * + * Also ignore exiting tasks, because in the exit path tasks transition + * from the autogroup to the root group, so task_group_is_autogroup() + * alone isn't able to catch exiting autogroup tasks. This is safe for + * cgroup_move(), because cgroup migrations never happen for PF_EXITING + * tasks. + */ + if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) + return; + + /* + * @p must have ops.cgroup_prep_move() called on it and thus + * cgrp_moving_from set. 
+ */ + if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, + p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + p->scx.cgrp_moving_from = NULL; +} + +void scx_cgroup_finish_attach(void) +{ + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + + if (!scx_enabled()) + goto out_unlock; + + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } +out_unlock: + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_weight(struct task_group *tg, unsigned long weight) +{ + percpu_down_read(&scx_cgroup_rwsem); + + if (tg->scx_weight != weight) { + if (SCX_HAS_OP(cgroup_set_weight)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, + tg_cgrp(tg), weight); + tg->scx_weight = weight; + } + + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_idle(struct task_group *tg, bool idle) +{ + percpu_down_read(&scx_cgroup_rwsem); + scx_cgroup_warn_missing_idle(tg); + percpu_up_read(&scx_cgroup_rwsem); +} + +static void scx_cgroup_lock(void) +{ + percpu_down_write(&scx_cgroup_rwsem); +} + +static void scx_cgroup_unlock(void) +{ + percpu_up_write(&scx_cgroup_rwsem); +} + +#else /* CONFIG_EXT_GROUP_SCHED */ + +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} + +#endif /* CONFIG_EXT_GROUP_SCHED */ + /* * Omitted operations: * @@ -3686,6 +4016,96 @@ out_unlock_rcu: rcu_read_unlock(); } +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(void) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. + */ + rcu_read_lock(); + css_for_each_descendant_post(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + + if (!(tg->scx_flags & SCX_TG_INITED)) + continue; + tg->scx_flags &= ~SCX_TG_INITED; + + if (!scx_ops.cgroup_exit) + continue; + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); +} + +static int scx_cgroup_init(void) +{ + struct cgroup_subsys_state *css; + int ret; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + cgroup_warned_missing_weight = false; + cgroup_warned_missing_idle = false; + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * the cgroups and init them, all online cgroups are initialized.
+ */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + scx_cgroup_warn_missing_weight(tg); + scx_cgroup_warn_missing_idle(tg); + + if ((tg->scx_flags & + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) + continue; + + if (!scx_ops.cgroup_init) { + tg->scx_flags |= SCX_TG_INITED; + continue; + } + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + css->cgroup, &args); + if (ret) { + css_put(css); + return ret; + } + tg->scx_flags |= SCX_TG_INITED; + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); + + return 0; +} + +#else +static void scx_cgroup_exit(void) {} +static int scx_cgroup_init(void) { return 0; } +#endif + /******************************************************************************** * Sysfs interface and ops enable/disable. @@ -3978,11 +4398,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* - * Avoid racing against fork. See scx_ops_enable() for explanation on - * the locking order. + * Avoid racing against fork and cgroup changes. See scx_ops_enable() + * for explanation on the locking order. */ percpu_down_write(&scx_fork_rwsem); cpus_read_lock(); + scx_cgroup_lock(); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); @@ -4018,6 +4439,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); synchronize_rcu(); + scx_cgroup_exit(); + + scx_cgroup_unlock(); cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); @@ -4574,11 +4998,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_watchdog_timeout / 2); /* - * Lock out forks before opening the floodgate so that they don't wander - * into the operations prematurely. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + * + * We don't need to keep the CPUs stable but static_branch_*() requires + * cpus_read_lock() and scx_cgroup_rwsem must nest inside + * cpu_hotplug_lock because of the following dependency chain: + * + * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem * - * We don't need to keep the CPUs stable but grab cpus_read_lock() to - * ease future locking changes for cgroup suport. + * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use + * static_branch_*_cpuslocked(). * * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the * following dependency chain: @@ -4587,6 +5017,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ percpu_down_write(&scx_fork_rwsem); cpus_read_lock(); + scx_cgroup_lock(); check_hotplug_seq(ops); @@ -4609,6 +5040,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); } + /* + * All cgroups should be initialized before letting in tasks. cgroup + * on/offlining and task migrations are already locked out. 
+ */ + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; + static_branch_enable_cpuslocked(&__scx_ops_enabled); /* @@ -4700,6 +5139,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_unlock_irq(&scx_tasks_lock); preempt_enable(); + scx_cgroup_unlock(); cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); @@ -4734,6 +5174,7 @@ err_unlock: return ret; err_disable_unlock_all: + scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable_unlock_cpus: cpus_read_unlock(); @@ -4928,6 +5369,11 @@ static int bpf_scx_check_member(const struct btf_type *t, switch (moff) { case offsetof(struct sched_ext_ops, init_task): +#ifdef CONFIG_EXT_GROUP_SCHED + case offsetof(struct sched_ext_ops, cgroup_init): + case offsetof(struct sched_ext_ops, cgroup_exit): + case offsetof(struct sched_ext_ops, cgroup_prep_move): +#endif case offsetof(struct sched_ext_ops, cpu_online): case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): @@ -5002,6 +5448,14 @@ static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} static void enable_stub(struct task_struct *p) {} static void disable_stub(struct task_struct *p) {} +#ifdef CONFIG_EXT_GROUP_SCHED +static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } +static void cgroup_exit_stub(struct cgroup *cgrp) {} +static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } +static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} +#endif static void cpu_online_stub(s32 cpu) {} static void cpu_offline_stub(s32 cpu) {} static s32 init_stub(void) { return -EINVAL; } @@ -5031,6 +5485,14 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .exit_task = exit_task_stub, .enable = enable_stub, .disable = disable_stub, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = cgroup_init_stub, + .cgroup_exit = cgroup_exit_stub, + .cgroup_prep_move = cgroup_prep_move_stub, + .cgroup_move = cgroup_move_stub, + .cgroup_cancel_move = cgroup_cancel_move_stub, + .cgroup_set_weight = cgroup_set_weight_stub, +#endif .cpu_online = cpu_online_stub, .cpu_offline = cpu_offline_stub, .init = init_stub, @@ -5280,7 +5742,8 @@ void __init init_sched_ext_class(void) * definitions so that BPF scheduler implementations can use them * through the generated vmlinux.h. */ - WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); + WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | + SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); @@ -6340,6 +6803,41 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) return cpu_rq(cpu); } +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all + * rq-locked operations. Can be called on the parameter tasks of rq-locked + * operations. 
The restriction guarantees that @p's rq is locked by the caller. + */ +#ifdef CONFIG_CGROUP_SCHED +__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + + if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + goto out; + + /* + * A task_group may either be a cgroup or an autogroup. In the latter + * case, @tg->css.cgroup is %NULL. A task_group can't become the other + * kind once created. + */ + if (tg && tg->css.cgroup) + cgrp = tg->css.cgroup; + else + cgrp = &cgrp_dfl_root.cgrp; +out: + cgroup_get(cgrp); + return cgrp; +} +#endif + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -6368,6 +6866,9 @@ BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) +#endif BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 32d3a51f591a..246019519231 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -67,3 +67,25 @@ static inline void scx_update_idle(struct rq *rq, bool idle) #else static inline void scx_update_idle(struct rq *rq, bool idle) {} #endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +void scx_group_set_idle(struct task_group *tg, bool idle); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0163f4af1c6e..b912752197d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -459,6 +459,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 20280df62857..457462b19966 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -61,6 +61,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; static inline __attribute__((format(printf, 1, 2))) 
void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 44612fdaf399..00bfa9cb95d3 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -95,6 +95,32 @@ void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) {} +s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) +{} + +s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{} + s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) { return 0; @@ -126,6 +152,12 @@ struct sched_ext_ops maximal_ops = { .enable = maximal_enable, .exit_task = maximal_exit_task, .disable = maximal_disable, + .cgroup_init = maximal_cgroup_init, + .cgroup_exit = maximal_cgroup_exit, + .cgroup_prep_move = maximal_cgroup_prep_move, + .cgroup_move = maximal_cgroup_move, + .cgroup_cancel_move = maximal_cgroup_cancel_move, + .cgroup_set_weight = maximal_cgroup_set_weight, .init = maximal_init, .exit = maximal_exit, .name = "maximal", -- cgit v1.2.3 From a4103eacc2ab408bb65e9902f0857b219fb489de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Sep 2024 10:24:59 -1000 Subject: sched_ext: Add a cgroup scheduler which uses flattened hierarchy This patch adds the scx_flatcg example scheduler which implements hierarchical weight-based cgroup CPU control by flattening the cgroup hierarchy into a single layer by compounding the active weight share at each level. This flattening of hierarchy can bring a substantial performance gain when the cgroup hierarchy is nested multiple levels. In a simple benchmark using wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it outperforms CFS by ~3% with the CPU controller disabled and by ~10% with two apache instances competing with a 2:1 weight ratio nested four levels deep. However, the gain comes at the cost of not being able to properly handle a thundering herd of cgroups. For example, if many cgroups which are nested behind a low priority parent cgroup wake up around the same time, they may be able to consume more CPU cycles than they are entitled to. In many use cases, this isn't a real concern, especially given the performance gain. Also, there are ways to mitigate the problem further by e.g. introducing an extra scheduling layer on cgroup delegation boundaries. v5: - Updated to specify SCX_OPS_HAS_CGROUP_WEIGHT instead of SCX_OPS_KNOB_CGROUP_WEIGHT. v4: - Revert reference counted kptr for cgv_node as the change caused easily reproducible stalls. v3: - Updated to reflect the core API changes including ops.init/exit_task() and direct dispatch from ops.select_cpu(). Fixes and improvements including additional statistics. - Use reference counted kptr for cgv_node instead of xchg'ing against stash location. - Dropped '-p' option. v2: - Use SCX_BUG[_ON]() to simplify error handling.
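To make the compounding arithmetic concrete, here is a small standalone userspace sketch (illustrative only, not part of the patch). HWEIGHT_ONE is assumed to match the fixed-point 1.0 (FCG_HWEIGHT_ONE) used by the scheduler, and the hierarchy is the R/A/B/C/D example from the scx_flatcg.bpf.c header comment below:

#include <stdint.h>
#include <stdio.h>

/* fixed-point 1.0; assumed to match FCG_HWEIGHT_ONE in scx_flatcg.h */
#define HWEIGHT_ONE (1ULL << 16)

static uint64_t div_round_up(uint64_t dividend, uint64_t divisor)
{
	return (dividend + divisor - 1) / divisor;
}

/* compound one level: parent's hweight scaled by weight / sibling weight sum */
static uint64_t compound(uint64_t parent_hw, uint64_t weight, uint64_t sum)
{
	return div_round_up(parent_hw * weight, sum);
}

int main(void)
{
	/* R -> A (100) and D (200); A -> B (100) and C (100) */
	uint64_t hw_a = compound(HWEIGHT_ONE, 100, 100 + 200);	/* ~1/3 */
	uint64_t hw_b = compound(hw_a, 100, 100 + 100);		/* ~1/6 */
	uint64_t hw_c = compound(hw_a, 100, 100 + 100);		/* ~1/6 */
	uint64_t hw_d = compound(HWEIGHT_ONE, 200, 100 + 200);	/* ~2/3 */

	printf("B=%llu C=%llu D=%llu (of %llu)\n",
	       (unsigned long long)hw_b, (unsigned long long)hw_c,
	       (unsigned long long)hw_d, (unsigned long long)HWEIGHT_ONE);
	return 0;
}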
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/README.md | 12 + tools/sched_ext/scx_flatcg.bpf.c | 949 +++++++++++++++++++++++++++++++++++++++ tools/sched_ext/scx_flatcg.c | 233 ++++++++++ tools/sched_ext/scx_flatcg.h | 51 +++ 5 files changed, 1246 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_flatcg.bpf.c create mode 100644 tools/sched_ext/scx_flatcg.c create mode 100644 tools/sched_ext/scx_flatcg.h (limited to 'tools') diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index bf7e108f5ae1..ca3815e572d8 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -176,7 +176,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_qmap scx_central +c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index 8efe70cc4363..16a42e4060f6 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -192,6 +192,18 @@ where this could be particularly useful is running VMs, where running with infinite slices and no timer ticks allows the VM to avoid unnecessary expensive vmexits. +## scx_flatcg + +A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical +weight-based cgroup CPU control by flattening the cgroup hierarchy into a single +layer, compounding the active weight share at each level. The effect of this +is a much more performant CPU controller, which does not need to descend down +cgroup trees in order to properly compute a cgroup's share. + +Similar to scx_simple, in limited scenarios, this scheduler can perform +reasonably well on single-socket systems with a unified L3 cache and show +significantly lowered hierarchical scheduling overhead. + # Troubleshooting diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c new file mode 100644 index 000000000000..3ab2b60781a0 --- /dev/null +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -0,0 +1,949 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext flattened cgroup hierarchy scheduler. It implements + * hierarchical weight-based cgroup CPU control by flattening the cgroup + * hierarchy into a single layer by compounding the active weight share at each + * level. Consider the following hierarchy with weights in parentheses: + * + * R + A (100) + B (100) + * | \ C (100) + * \ D (200) + * + * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. + * Let's say all three have runnable tasks. The total share that each of these + * three cgroups is entitled to can be calculated by compounding its share at + * each level. + * + * For example, B is competing against C and in that competition its share is + * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's + * share in that competition is 100/(200+100) == 1/3. B's eventual share in the + * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's + * eventual share is the same at 1/6. D is only competing at the top level and + * its share is 200/(100+200) == 2/3.
+ * + * So, instead of hierarchically scheduling level-by-level, we can consider it + * as B, C and D competing with each other with respective shares of 1/6, 1/6 and 2/3 + * and keep updating the eventual shares as the cgroups' runnable states change. + * + * This flattening of hierarchy can bring a substantial performance gain when + * the cgroup hierarchy is nested multiple levels. In a simple benchmark using + * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it + * outperforms CFS by ~3% with the CPU controller disabled and by ~10% with two + * apache instances competing with a 2:1 weight ratio nested four levels deep. + * + * However, the gain comes at the cost of not being able to properly handle + * a thundering herd of cgroups. For example, if many cgroups which are nested + * behind a low priority parent cgroup wake up around the same time, they may be + * able to consume more CPU cycles than they are entitled to. In many use cases, + * this isn't a real concern, especially given the performance gain. Also, there + * are ways to mitigate the problem further by e.g. introducing an extra + * scheduling layer on cgroup delegation boundaries. + * + * The scheduler first picks the cgroup to run and then schedules the tasks + * within by using nested weighted vtime scheduling by default. The + * cgroup-internal scheduling can be switched to FIFO with the -f option. + */ +#include +#include "scx_flatcg.h" + +/* + * Maximum amount of retries to find a valid cgroup. + */ +#define CGROUP_MAX_RETRIES 1024 + +char _license[] SEC("license") = "GPL"; + +const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ +const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; +const volatile bool fifo_sched; + +u64 cvtime_now; +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, FCG_NR_STATS); +} stats SEC(".maps"); + +static void stat_inc(enum fcg_stat_idx idx) +{ + u32 idx_v = idx; + + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); + if (cnt_p) + (*cnt_p)++; +} + +struct fcg_cpu_ctx { + u64 cur_cgid; + u64 cur_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct fcg_cpu_ctx); + __uint(max_entries, 1); +} cpu_ctx SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_cgrp_ctx); +} cgrp_ctx SEC(".maps"); + +struct cgv_node { + struct bpf_rb_node rb_node; + __u64 cvtime; + __u64 cgid; +}; + +private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; +private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); + +struct cgv_node_stash { + struct cgv_node __kptr *node; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 16384); + __type(key, __u64); + __type(value, struct cgv_node_stash); +} cgv_node_stash SEC(".maps"); + +struct fcg_task_ctx { + u64 bypassed_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_task_ctx); +} task_ctx SEC(".maps"); + +/* gets inc'd on weight tree changes to expire the cached hweights */ +u64 hweight_gen = 1; + +static u64 div_round_up(u64 dividend, u64 divisor) +{ + return (dividend + divisor - 1) / divisor; +} + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct cgv_node *cgc_a,
*cgc_b; + + cgc_a = container_of(a, struct cgv_node, rb_node); + cgc_b = container_of(b, struct cgv_node, rb_node); + + return cgc_a->cvtime < cgc_b->cvtime; +} + +static struct fcg_cpu_ctx *find_cpu_ctx(void) +{ + struct fcg_cpu_ctx *cpuc; + u32 idx = 0; + + cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); + if (!cpuc) { + scx_bpf_error("cpu_ctx lookup failed"); + return NULL; + } + return cpuc; +} + +static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) +{ + struct fcg_cgrp_ctx *cgc; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); + return NULL; + } + return cgc; +} + +static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) +{ + struct fcg_cgrp_ctx *cgc; + + cgrp = bpf_cgroup_ancestor(cgrp, level); + if (!cgrp) { + scx_bpf_error("ancestor cgroup lookup failed"); + return NULL; + } + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + scx_bpf_error("ancestor cgrp_ctx lookup failed"); + bpf_cgroup_release(cgrp); + return cgc; +} + +static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + int level; + + if (!cgc->nr_active) { + stat_inc(FCG_STAT_HWT_SKIP); + return; + } + + if (cgc->hweight_gen == hweight_gen) { + stat_inc(FCG_STAT_HWT_CACHE); + return; + } + + stat_inc(FCG_STAT_HWT_UPDATES); + bpf_for(level, 0, cgrp->level + 1) { + struct fcg_cgrp_ctx *cgc; + bool is_active; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + + if (!level) { + cgc->hweight = FCG_HWEIGHT_ONE; + cgc->hweight_gen = hweight_gen; + } else { + struct fcg_cgrp_ctx *pcgc; + + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + + /* + * We can be opportunistic here and not grab the + * cgv_tree_lock and deal with the occasional races. + * However, hweight updates are already cached and + * relatively low-frequency. Let's just do the + * straightforward thing. + */ + bpf_spin_lock(&cgv_tree_lock); + is_active = cgc->nr_active; + if (is_active) { + cgc->hweight_gen = pcgc->hweight_gen; + cgc->hweight = + div_round_up(pcgc->hweight * cgc->weight, + pcgc->child_weight_sum); + } + bpf_spin_unlock(&cgv_tree_lock); + + if (!is_active) { + stat_inc(FCG_STAT_HWT_RACE); + break; + } + } + } +} + +static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) +{ + u64 delta, cvtime, max_budget; + + /* + * A node which is on the rbtree can't be pointed to from elsewhere yet + * and thus can't be updated and repositioned. Instead, we collect the + * vtime deltas separately and apply them asynchronously here. + */ + delta = cgc->cvtime_delta; + __sync_fetch_and_sub(&cgc->cvtime_delta, delta); + cvtime = cgv_node->cvtime + delta; + + /* + * Allow a cgroup to carry the maximum budget proportional to its + * hweight such that a full-hweight cgroup can immediately take up half + * of the CPUs at the most while staying at the front of the rbtree.
+ */ + max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / + (2 * FCG_HWEIGHT_ONE); + if (vtime_before(cvtime, cvtime_now - max_budget)) + cvtime = cvtime_now - max_budget; + + cgv_node->cvtime = cvtime; +} + +static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + struct cgv_node_stash *stash; + struct cgv_node *cgv_node; + u64 cgid = cgrp->kn->id; + + /* paired with cmpxchg in try_pick_next_cgroup() */ + if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { + stat_inc(FCG_STAT_ENQ_SKIP); + return; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); + return; + } + + /* NULL if the node is already on the rbtree */ + cgv_node = bpf_kptr_xchg(&stash->node, NULL); + if (!cgv_node) { + stat_inc(FCG_STAT_ENQ_RACE); + return; + } + + bpf_spin_lock(&cgv_tree_lock); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); +} + +static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) +{ + /* + * Tell fcg_stopping() that this bypassed the regular scheduling path + * and should be force charged to the cgroup. 0 is used to indicate that + * the task isn't bypassing, so if the current runtime is 0, go back by + * one nanosecond. + */ + taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; +} + +s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct fcg_task_ctx *taskc; + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return cpu; + } + + /* + * If select_cpu_dfl() is recommending local enqueue, the target CPU is + * idle. Follow it and charge the cgroup later in fcg_stopping() after + * the fact. + */ + if (is_idle) { + set_bypassed_at(p, taskc); + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* + * Use the direct dispatching and force charging to deal with tasks with + * custom affinities so that we don't have to worry about per-cgroup + * dq's containing tasks that can't be executed from some CPUs. + */ + if (p->nr_cpus_allowed != nr_cpus) { + set_bypassed_at(p, taskc); + + /* + * The global dq is deprioritized as we don't want to let tasks + * to boost themselves by constraining its cpumask. The + * deprioritization is rather severe, so let's not apply that to + * per-cpu kernel threads. This is ham-fisted. We probably wanna + * implement per-cgroup fallback dq's instead so that we have + * more control over when tasks with custom cpumask get issued. 
+ */ + if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + } else { + stat_inc(FCG_STAT_GLOBAL); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + return; + } + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + goto out_release; + + if (fifo_sched) { + scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + } else { + u64 tvtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) + tvtime = cgc->tvtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, + tvtime, enq_flags); + } + + cgrp_enqueued(cgrp, cgc); +out_release: + bpf_cgroup_release(cgrp); +} + +/* + * Walk the cgroup tree to update the active weight sums as tasks wake up and + * sleep. The weight sums are used as the base when calculating the proportion a + * given cgroup or task is entitled to at each level. + */ +static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) +{ + struct fcg_cgrp_ctx *cgc; + bool updated = false; + int idx; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + /* + * In most cases, a hot cgroup would have multiple threads going to + * sleep and waking up while the whole cgroup stays active. In leaf + * cgroups, ->nr_runnable which is updated with __sync operations gates + * ->nr_active updates, so that we don't have to grab the cgv_tree_lock + * repeatedly for a busy cgroup which is staying active. + */ + if (runnable) { + if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_ACT); + } else { + if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_DEACT); + } + + /* + * If @cgrp is becoming runnable, its hweight should be refreshed after + * it's added to the weight tree so that enqueue has the up-to-date + * value. If @cgrp is becoming quiescent, the hweight should be + * refreshed before it's removed from the weight tree so that the usage + * charging which happens afterwards has access to the latest value. + */ + if (!runnable) + cgrp_refresh_hweight(cgrp, cgc); + + /* propagate upwards */ + bpf_for(idx, 0, cgrp->level) { + int level = cgrp->level - idx; + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + bool propagate = false; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + if (level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + } + + /* + * We need the propagation protected by a lock to synchronize + * against weight changes. There's no reason to drop the lock at + * each level but bpf_spin_lock() doesn't want any function + * calls while locked. 
+ */ + bpf_spin_lock(&cgv_tree_lock); + + if (runnable) { + if (!cgc->nr_active++) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum += cgc->weight; + } + } + } else { + if (!--cgc->nr_active) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum -= cgc->weight; + } + } + } + + bpf_spin_unlock(&cgv_tree_lock); + + if (!propagate) + break; + } + + if (updated) + __sync_fetch_and_add(&hweight_gen, 1); + + if (runnable) + cgrp_refresh_hweight(cgrp, cgc); +} + +void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, true); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) +{ + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + if (fifo_sched) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + /* + * @cgc->tvtime_now always progresses forward as tasks start + * executing. The test and update can be performed concurrently + * from multiple CPUs and thus racy. Any error should be + * contained and temporary. Let's just live with it. + */ + if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) + cgc->tvtime_now = p->scx.dsq_vtime; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. 
+ */ + if (!fifo_sched) + p->scx.dsq_vtime += + (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (!taskc->bypassed_at) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + __sync_fetch_and_add(&cgc->cvtime_delta, + p->se.sum_exec_runtime - taskc->bypassed_at); + taskc->bypassed_at = 0; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, false); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{ + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + if (cgrp->level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); + if (!pcgc) + return; + } + + bpf_spin_lock(&cgv_tree_lock); + if (pcgc && cgc->nr_active) + pcgc->child_weight_sum += (s64)weight - cgc->weight; + cgc->weight = weight; + bpf_spin_unlock(&cgv_tree_lock); +} + +static bool try_pick_next_cgroup(u64 *cgidp) +{ + struct bpf_rb_node *rb_node; + struct cgv_node_stash *stash; + struct cgv_node *cgv_node; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 cgid; + + /* pop the front cgroup and wind cvtime_now accordingly */ + bpf_spin_lock(&cgv_tree_lock); + + rb_node = bpf_rbtree_first(&cgv_tree); + if (!rb_node) { + bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_NO_CGRP); + *cgidp = 0; + return true; + } + + rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); + bpf_spin_unlock(&cgv_tree_lock); + + if (!rb_node) { + /* + * This should never happen. bpf_rbtree_first() was called + * above while the tree lock was held, so the node should + * always be present. + */ + scx_bpf_error("node could not be removed"); + return true; + } + + cgv_node = container_of(rb_node, struct cgv_node, rb_node); + cgid = cgv_node->cgid; + + if (vtime_before(cvtime_now, cgv_node->cvtime)) + cvtime_now = cgv_node->cvtime; + + /* + * If lookup fails, the cgroup's gone. Free and move on. See + * fcg_cgroup_exit(). + */ + cgrp = bpf_cgroup_from_id(cgid); + if (!cgrp) { + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + if (!scx_bpf_consume(cgid)) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_EMPTY); + goto out_stash; + } + + /* + * Successfully consumed from the cgroup. This will be our current + * cgroup for the new slice. Refresh its hweight. + */ + cgrp_refresh_hweight(cgrp, cgc); + + bpf_cgroup_release(cgrp); + + /* + * As the cgroup may have more tasks, add it back to the rbtree. Note + * that here we charge the full slice upfront and then exact later + * according to the actual consumption. This prevents lowpri thundering + * herd from saturating the machine. + */ + bpf_spin_lock(&cgv_tree_lock); + cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); + + *cgidp = cgid; + stat_inc(FCG_STAT_PNC_NEXT); + return true; + +out_stash: + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + /* + * Paired with cmpxchg in cgrp_enqueued(). 
If they see the following + * transition, they'll enqueue the cgroup. If they are earlier, we'll + * see their task in the dq below and requeue the cgroup. + */ + __sync_val_compare_and_swap(&cgc->queued, 1, 0); + + if (scx_bpf_dsq_nr_queued(cgid)) { + bpf_spin_lock(&cgv_tree_lock); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_RACE); + } else { + cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); + if (cgv_node) { + scx_bpf_error("unexpected !NULL cgv_node stash"); + goto out_free; + } + } + + return false; + +out_free: + bpf_obj_drop(cgv_node); + return false; +} + +void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) +{ + struct fcg_cpu_ctx *cpuc; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 now = bpf_ktime_get_ns(); + bool picked_next = false; + + cpuc = find_cpu_ctx(); + if (!cpuc) + return; + + if (!cpuc->cur_cgid) + goto pick_next_cgroup; + + if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { + if (scx_bpf_consume(cpuc->cur_cgid)) { + stat_inc(FCG_STAT_CNS_KEEP); + return; + } + stat_inc(FCG_STAT_CNS_EMPTY); + } else { + stat_inc(FCG_STAT_CNS_EXPIRE); + } + + /* + * The current cgroup is expiring. It was already charged a full slice. + * Calculate the actual usage and accumulate the delta. + */ + cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); + if (!cgrp) { + stat_inc(FCG_STAT_CNS_GONE); + goto pick_next_cgroup; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (cgc) { + /* + * We want to update the vtime delta and then look for the next + * cgroup to execute but the latter needs to be done in a loop + * and we can't keep the lock held. Oh well... + */ + bpf_spin_lock(&cgv_tree_lock); + __sync_fetch_and_add(&cgc->cvtime_delta, + (cpuc->cur_at + cgrp_slice_ns - now) * + FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); + bpf_spin_unlock(&cgv_tree_lock); + } else { + stat_inc(FCG_STAT_CNS_GONE); + } + + bpf_cgroup_release(cgrp); + +pick_next_cgroup: + cpuc->cur_at = now; + + if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { + cpuc->cur_cgid = 0; + return; + } + + bpf_repeat(CGROUP_MAX_RETRIES) { + if (try_pick_next_cgroup(&cpuc->cur_cgid)) { + picked_next = true; + break; + } + } + + /* + * This only happens if try_pick_next_cgroup() races against enqueue + * path for more than CGROUP_MAX_RETRIES times, which is extremely + * unlikely and likely indicates an underlying bug. There shouldn't be + * any stall risk as the race is against enqueue. + */ + if (!picked_next) + stat_inc(FCG_STAT_PNC_FAIL); +} + +s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct fcg_task_ctx *taskc; + struct fcg_cgrp_ctx *cgc; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + taskc = bpf_task_storage_get(&task_ctx, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!taskc) + return -ENOMEM; + + taskc->bypassed_at = 0; + + if (!(cgc = find_cgrp_ctx(args->cgroup))) + return -ENOENT; + + p->scx.dsq_vtime = cgc->tvtime_now; + + return 0; +} + +int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + struct fcg_cgrp_ctx *cgc; + struct cgv_node *cgv_node; + struct cgv_node_stash empty_stash = {}, *stash; + u64 cgid = cgrp->kn->id; + int ret; + + /* + * Technically incorrect as cgroup ID is full 64bit while dq ID is + * 63bit. 
Should not be a problem in practice and easy to spot in the + * unlikely case that it breaks. + */ + ret = scx_bpf_create_dsq(cgid, -1); + if (ret) + return ret; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!cgc) { + ret = -ENOMEM; + goto err_destroy_dsq; + } + + cgc->weight = args->weight; + cgc->hweight = FCG_HWEIGHT_ONE; + + ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, + BPF_NOEXIST); + if (ret) { + if (ret != -ENOMEM) + scx_bpf_error("unexpected stash creation error (%d)", + ret); + goto err_destroy_dsq; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + scx_bpf_error("unexpected cgv_node stash lookup failure"); + ret = -ENOENT; + goto err_destroy_dsq; + } + + cgv_node = bpf_obj_new(struct cgv_node); + if (!cgv_node) { + ret = -ENOMEM; + goto err_del_cgv_node; + } + + cgv_node->cgid = cgid; + cgv_node->cvtime = cvtime_now; + + cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); + if (cgv_node) { + scx_bpf_error("unexpected !NULL cgv_node stash"); + ret = -EBUSY; + goto err_drop; + } + + return 0; + +err_drop: + bpf_obj_drop(cgv_node); +err_del_cgv_node: + bpf_map_delete_elem(&cgv_node_stash, &cgid); +err_destroy_dsq: + scx_bpf_destroy_dsq(cgid); + return ret; +} + +void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + + /* + * For now, there's no way find and remove the cgv_node if it's on the + * cgv_tree. Let's drain them in the dispatch path as they get popped + * off the front of the tree. + */ + bpf_map_delete_elem(&cgv_node_stash, &cgid); + scx_bpf_destroy_dsq(cgid); +} + +void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + struct fcg_cgrp_ctx *from_cgc, *to_cgc; + s64 vtime_delta; + + /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ + if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) + return; + + vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; + p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; +} + +void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(flatcg_ops, + .select_cpu = (void *)fcg_select_cpu, + .enqueue = (void *)fcg_enqueue, + .dispatch = (void *)fcg_dispatch, + .runnable = (void *)fcg_runnable, + .running = (void *)fcg_running, + .stopping = (void *)fcg_stopping, + .quiescent = (void *)fcg_quiescent, + .init_task = (void *)fcg_init_task, + .cgroup_set_weight = (void *)fcg_cgroup_set_weight, + .cgroup_init = (void *)fcg_cgroup_init, + .cgroup_exit = (void *)fcg_cgroup_exit, + .cgroup_move = (void *)fcg_cgroup_move, + .exit = (void *)fcg_exit, + .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, + .name = "flatcg"); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c new file mode 100644 index 000000000000..5d24ca9c29d9 --- /dev/null +++ b/tools/sched_ext/scx_flatcg.c @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#include <stdio.h> +#include <signal.h> +#include <unistd.h> +#include <libgen.h> +#include <limits.h> +#include <inttypes.h> +#include <fcntl.h> +#include <time.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_flatcg.h" +#include "scx_flatcg.bpf.skel.h" + +#ifndef FILEID_KERNFS +#define FILEID_KERNFS 0xfe +#endif + +const char help_fmt[] = +"A flattened cgroup hierarchy sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -i INTERVAL Report interval\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) +{ + FILE *fp; + char buf[4096]; + char *line, *cur = NULL, *tok; + __u64 sum = 0, idle = 0; + __u64 delta_sum, delta_idle; + int idx; + + fp = fopen("/proc/stat", "r"); + if (!fp) { + perror("fopen(\"/proc/stat\")"); + return 0.0; + } + + if (!fgets(buf, sizeof(buf), fp)) { + perror("fgets(\"/proc/stat\")"); + fclose(fp); + return 0.0; + } + fclose(fp); + + line = buf; + for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { + char *endp = NULL; + __u64 v; + + if (idx == 0) { + line = NULL; + continue; + } + v = strtoull(tok, &endp, 0); + if (!endp || *endp != '\0') { + fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", + idx, tok); + continue; + } + sum += v; + if (idx == 4) + idle = v; + } + + delta_sum = sum - *last_sum; + delta_idle = idle - *last_idle; + *last_sum = sum; + *last_idle = idle; + + return delta_sum ?
(float)(delta_sum - delta_idle) / delta_sum : 0.0; +} + +static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) +{ + __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); + + for (idx = 0; idx < FCG_NR_STATS; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_flatcg *skel; + struct bpf_link *link; + struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; + bool dump_cgrps = false; + __u64 last_cpu_sum = 0, last_cpu_idle = 0; + __u64 last_stats[FCG_NR_STATS] = {}; + unsigned long seq = 0; + __s32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { + double v; + + switch (opt) { + case 's': + v = strtod(optarg, NULL); + skel->rodata->cgrp_slice_ns = v * 1000; + break; + case 'i': + v = strtod(optarg, NULL); + intv_ts.tv_sec = v; + intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; + break; + case 'd': + dump_cgrps = true; + break; + case 'f': + skel->rodata->fifo_sched = true; + break; + case 'v': + verbose = true; + break; + case 'h': + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", + (double)skel->rodata->cgrp_slice_ns / 1000000.0, + (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, + dump_cgrps); + + SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); + link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 acc_stats[FCG_NR_STATS]; + __u64 stats[FCG_NR_STATS]; + float cpu_util; + int i; + + cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); + + fcg_read_stats(skel, acc_stats); + for (i = 0; i < FCG_NR_STATS; i++) + stats[i] = acc_stats[i] - last_stats[i]; + + memcpy(last_stats, acc_stats, sizeof(acc_stats)); + + printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", + seq++, cpu_util * 100.0, skel->data->hweight_gen); + printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", + stats[FCG_STAT_ACT], + stats[FCG_STAT_DEACT], + stats[FCG_STAT_GLOBAL], + stats[FCG_STAT_LOCAL]); + printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", + stats[FCG_STAT_HWT_CACHE], + stats[FCG_STAT_HWT_UPDATES], + stats[FCG_STAT_HWT_SKIP], + stats[FCG_STAT_HWT_RACE]); + printf("ENQ skip:%6llu race:%6llu\n", + stats[FCG_STAT_ENQ_SKIP], + stats[FCG_STAT_ENQ_RACE]); + printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", + stats[FCG_STAT_CNS_KEEP], + stats[FCG_STAT_CNS_EXPIRE], + stats[FCG_STAT_CNS_EMPTY], + stats[FCG_STAT_CNS_GONE]); + printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", + stats[FCG_STAT_PNC_NEXT], + stats[FCG_STAT_PNC_EMPTY], + stats[FCG_STAT_PNC_NO_CGRP], + stats[FCG_STAT_PNC_GONE], + stats[FCG_STAT_PNC_RACE], + stats[FCG_STAT_PNC_FAIL]); + printf("BAD remove:%6llu\n", + acc_stats[FCG_STAT_BAD_REMOVAL]); + fflush(stdout); + + nanosleep(&intv_ts, NULL); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_flatcg__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git 
a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h new file mode 100644 index 000000000000..6f2ea50acb1c --- /dev/null +++ b/tools/sched_ext/scx_flatcg.h @@ -0,0 +1,51 @@ +#ifndef __SCX_EXAMPLE_FLATCG_H +#define __SCX_EXAMPLE_FLATCG_H + +enum { + FCG_HWEIGHT_ONE = 1LLU << 16, +}; + +enum fcg_stat_idx { + FCG_STAT_ACT, + FCG_STAT_DEACT, + FCG_STAT_LOCAL, + FCG_STAT_GLOBAL, + + FCG_STAT_HWT_UPDATES, + FCG_STAT_HWT_CACHE, + FCG_STAT_HWT_SKIP, + FCG_STAT_HWT_RACE, + + FCG_STAT_ENQ_SKIP, + FCG_STAT_ENQ_RACE, + + FCG_STAT_CNS_KEEP, + FCG_STAT_CNS_EXPIRE, + FCG_STAT_CNS_EMPTY, + FCG_STAT_CNS_GONE, + + FCG_STAT_PNC_NO_CGRP, + FCG_STAT_PNC_NEXT, + FCG_STAT_PNC_EMPTY, + FCG_STAT_PNC_GONE, + FCG_STAT_PNC_RACE, + FCG_STAT_PNC_FAIL, + + FCG_STAT_BAD_REMOVAL, + + FCG_NR_STATS, +}; + +struct fcg_cgrp_ctx { + u32 nr_active; + u32 nr_runnable; + u32 queued; + u32 weight; + u32 hweight; + u64 child_weight_sum; + u64 hweight_gen; + s64 cvtime_delta; + u64 tvtime_now; +}; + +#endif /* __SCX_EXAMPLE_FLATCG_H */ -- cgit v1.2.3 From f0a6ecebd858658df213d114b0530f8f0b96e396 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 5 Sep 2024 00:30:21 +0900 Subject: selftests/ftrace: Fix eventfs ownership testcase to find mount point Fix eventfs ownership testcase to find mount point if stat -c "%m" failed. This can happen on the system based on busybox. In this case, this will try to use the current working directory, which should be a tracefs top directory (and eventfs is mounted as a part of tracefs.) If it does not work, the test is skipped as UNRESOLVED because of the environmental problem. Fixes: ee9793be08b1 ("tracing/selftests: Add ownership modification tests for eventfs") Signed-off-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/00basic/test_ownership.tc | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc index 71e43a92352a..094419e190c2 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc @@ -6,6 +6,18 @@ original_group=`stat -c "%g" .` original_owner=`stat -c "%u" .` mount_point=`stat -c '%m' .` + +# If stat -c '%m' does not work (e.g. busybox) or failed, try to use the +# current working directory (which should be a tracefs) as the mount point. +if [ ! -d "$mount_point" ]; then + if mount | grep -qw $PWD ; then + mount_point=$PWD + else + # If PWD doesn't work, that is an environmental problem. + exit_unresolved + fi +fi + mount_options=`mount | grep "$mount_point" | sed -e 's/.*(\(.*\)).*/\1/'` # find another owner and group that is not the original -- cgit v1.2.3 From eff5b5fffc1d39fb9e5e66d6aa4ed3fcb109caac Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 4 Sep 2024 15:12:56 -0700 Subject: selftests/bpf: Add a selftest for x86 jit convergence issues The core part of the selftest, i.e., the je <-> jmp cycle, mimics the original sched-ext bpf program. The test will fail without the previous patch. I tried to create some cases for other potential cycles (je <-> je, jmp <-> je and jmp <-> jmp) with similar pattern to the test in this patch, but failed. So this patch only contains one test for je <-> jmp cycle. 
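To see why the JIT needs multiple sizing passes in the first place: each x86 branch can be encoded short (rel8) or near (rel32), the choice depends on the distance to the target, and that distance in turn depends on the sizes picked for the branches in between. The toy relaxation loop below illustrates that fixed-point iteration; the payload sizes and the 2/6-byte encodings are stand-ins, and this is not the kernel's JIT, which additionally has to reject layouts that never settle (the case this test exercises).

#include <stdio.h>

#define NINSNS 3
#define MAXPASS 20

int main(void)
{
	/* payload[i]: fixed bytes between branch i and branch i + 1 */
	int payload[NINSNS] = { 60, 60, 20 };
	/* size[i]: encoded size of branch i, start optimistic (short form) */
	int size[NINSNS] = { 2, 2, 2 };
	int pass, i, j, changed;

	for (pass = 1; pass <= MAXPASS; pass++) {
		changed = 0;
		for (i = 0; i < NINSNS; i++) {
			int dist = 0, want;

			/* branch i jumps forward past everything after it */
			for (j = i; j < NINSNS; j++)
				dist += payload[j] + (j > i ? size[j] : 0);
			/* rel8 reaches +/-127 bytes, otherwise near form */
			want = dist <= 127 ? 2 : 6;
			if (want != size[i]) {
				size[i] = want;
				changed = 1;
			}
		}
		printf("pass %d: sizes %d %d %d\n", pass, size[0], size[1], size[2]);
		if (!changed)
			return 0;	/* layout converged */
	}
	/* sizes still changing after MAXPASS passes: the JIT must bail out */
	return 1;
}

With these numbers the loop settles after two passes; the convergence bug arises when two branches keep pushing each other across the rel8 boundary so the sizes oscillate between passes instead of reaching a fixed point.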
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240904221256.37389-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_jit_convergence.c | 114 +++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_jit_convergence.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 80a90c627182..df398e714dff 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -39,6 +39,7 @@ #include "verifier_int_ptr.skel.h" #include "verifier_iterating_callbacks.skel.h" #include "verifier_jeq_infer_not_null.skel.h" +#include "verifier_jit_convergence.skel.h" #include "verifier_ld_ind.skel.h" #include "verifier_ldsx.skel.h" #include "verifier_leak_ptr.skel.h" @@ -163,6 +164,7 @@ void test_verifier_helper_value_access(void) { RUN(verifier_helper_value_access void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); } void test_verifier_iterating_callbacks(void) { RUN(verifier_iterating_callbacks); } void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); } +void test_verifier_jit_convergence(void) { RUN(verifier_jit_convergence); } void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } void test_verifier_ldsx(void) { RUN(verifier_ldsx); } void test_verifier_leak_ptr(void) { RUN(verifier_leak_ptr); } diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_convergence.c b/tools/testing/selftests/bpf/progs/verifier_jit_convergence.c new file mode 100644 index 000000000000..9f3f2b7db450 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_jit_convergence.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +struct value_t { + long long a[32]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, long long); + __type(value, struct value_t); +} map_hash SEC(".maps"); + +SEC("socket") +__description("bpf_jit_convergence je <-> jmp") +__success __retval(0) +__arch_x86_64 +__jited(" pushq %rbp") +__naked void btf_jit_convergence_je_jmp(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 == 0 goto l20_%=;" + "if r0 == 1 goto l21_%=;" + "if r0 == 2 goto l22_%=;" + "if r0 == 3 goto l23_%=;" + "if r0 == 4 goto l24_%=;" + "call %[bpf_get_prandom_u32];" + "call %[bpf_get_prandom_u32];" +"l20_%=:" +"l21_%=:" +"l22_%=:" +"l23_%=:" +"l24_%=:" + "r1 = 0;" + "*(u64 *)(r10 - 8) = r1;" + "r2 = r10;" + "r2 += -8;" + "r1 = %[map_hash] ll;" + "call %[bpf_map_lookup_elem];" + "if r0 == 0 goto l1_%=;" + "r6 = r0;" + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r5 = r6;" + "if r0 != 0x0 goto l12_%=;" + "call %[bpf_get_prandom_u32];" + "r1 = r0;" + "r2 = r6;" + "if r1 == 0x0 goto l0_%=;" +"l9_%=:" + "r2 = *(u64 *)(r6 + 0x0);" + "r2 += 0x1;" + "*(u64 *)(r6 + 0x0) = r2;" + "goto l1_%=;" +"l12_%=:" + "r1 = r7;" + "r1 += 0x98;" + "r2 = r5;" + "r2 += 0x90;" + "r2 = *(u32 *)(r2 + 0x0);" + "r3 = r7;" + "r3 &= 0x1;" + "r2 *= 0xa8;" + "if r3 == 0x0 goto l2_%=;" + "r1 += r2;" + "r1 -= r7;" + "r1 += 0x8;" + "if r1 <= 0xb20 goto l3_%=;" + "r1 = 0x0;" + "goto l4_%=;" +"l3_%=:" + "r1 += r7;" +"l4_%=:" + "if r1 == 0x0 goto l8_%=;" + "goto l9_%=;" +"l2_%=:" + "r1 += r2;" + "r1 -= r7;" + "r1 += 0x10;" + "if r1 <= 0xb20 goto l6_%=;" + "r1 = 0x0;" + "goto l7_%=;" +"l6_%=:" + "r1 += r7;" +"l7_%=:" + "if r1 == 0x0 goto l8_%=;" + "goto l9_%=;" +"l0_%=:" + "r1 = 0x3;" + "*(u64 *)(r10 - 0x10) = r1;" + "r2 = r1;" + "goto l1_%=;" +"l8_%=:" + "r1 = r5;" + "r1 += 0x4;" + "r1 = *(u32 *)(r1 + 0x0);" + "*(u64 *)(r10 - 0x8) = r1;" +"l1_%=:" + "r0 = 0;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_map_lookup_elem), + __imm_addr(map_hash) + : __clobber_all); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6ffa72acc9c933a065782cb49afde1130ca722f7 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 4 Sep 2024 09:44:41 +0800 Subject: selftests: net: convert comma to semicolon Replace comma between expressions with semicolons. Using a ',' in place of a ';' can have unintended side effects. Although that is not the case here, it seems best to use ';' unless ',' is intended. Found by inspection. No functional change intended. Compile tested only.
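A minimal illustration of the pitfall the commit message alludes to, with made-up variables rather than the psock_fanout.c ones: the comma operator chains the assignments into one expression, so an enclosing if would silently guard all of them.

#include <stdio.h>

int main(void)
{
	int a = 1, b = 1, c = 1;
	int reset = 0;

	if (reset)
		a = 0,	/* comma operator: b and c are still */
		b = 0,	/* part of the if body, whether or   */
		c = 0;	/* not that was intended             */

	/* prints "1 1 1"; with ';' instead of ',' it would print "1 0 0" */
	printf("%d %d %d\n", a, b, c);
	return 0;
}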
Signed-off-by: Chen Ni Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240904014441.1065753-1-nichen@iscas.ac.cn Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/psock_fanout.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 1a736f700be4..4f31e92ebd96 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -165,9 +165,9 @@ static void sock_fanout_set_ebpf(int fd) attr.insns = (unsigned long) prog; attr.insn_cnt = ARRAY_SIZE(prog); attr.license = (unsigned long) "GPL"; - attr.log_buf = (unsigned long) log_buf, - attr.log_size = sizeof(log_buf), - attr.log_level = 1, + attr.log_buf = (unsigned long) log_buf; + attr.log_size = sizeof(log_buf); + attr.log_level = 1; pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); if (pfd < 0) { -- cgit v1.2.3 From e4db2a821b6cff4bd59c0efabdbfdd84ac6005c1 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 31 Aug 2024 04:19:31 +0000 Subject: libbpf: Access first syscall argument with CO-RE direct read on s390 Currently PT_REGS_PARM1_SYSCALL(x) is consistent with PT_REGS_PARM1_CORE_SYSCALL(x), which will introduce the overhead of BPF_CORE_READ(). Taking into account that the read pt_regs comes directly from the context, let's use CO-RE direct read to access the first system call argument. Suggested-by: Andrii Nakryiko Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240831041934.1629216-2-pulehui@huaweicloud.com --- tools/lib/bpf/bpf_tracing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 9314fa95f04e..1297d8d297d1 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -163,7 +163,7 @@ struct pt_regs___s390 { unsigned long orig_gpr2; -}; +} __attribute__((preserve_access_index)); /* s390 provides user_pt_regs instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const user_pt_regs *)(x)) @@ -179,7 +179,7 @@ struct pt_regs___s390 { #define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG #define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG #define __PT_PARM6_SYSCALL_REG gprs[7] -#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___s390 *)(x))->__PT_PARM1_SYSCALL_REG) #define PT_REGS_PARM1_CORE_SYSCALL(x) \ BPF_CORE_READ((const struct pt_regs___s390 *)(x), __PT_PARM1_SYSCALL_REG) -- cgit v1.2.3 From 9ab94078e868a45cbd23828b8377600b83a41fac Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 31 Aug 2024 04:19:32 +0000 Subject: libbpf: Access first syscall argument with CO-RE direct read on arm64 Currently PT_REGS_PARM1_SYSCALL(x) is consistent with PT_REGS_PARM1_CORE_SYSCALL(x), which will introduce the overhead of BPF_CORE_READ(). Taking into account that the read pt_regs comes directly from the context, let's use CO-RE direct read to access the first system call argument.
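Condensed, the two access styles these patches choose between look as follows; this is a sketch of the bpf_tracing.h pattern with only the s390 field, not the complete header, and the PARM1_*_SKETCH names are made up:

#include <bpf/bpf_core_read.h>

/* CO-RE shim as in the hunk above: only the field we need, with a
 * relocatable offset */
struct pt_regs___s390 {
	unsigned long orig_gpr2;
} __attribute__((preserve_access_index));

/* helper-based flavor: expands to bpf_probe_read_kernel() under the hood */
#define PARM1_CORE_SKETCH(x) \
	BPF_CORE_READ((const struct pt_regs___s390 *)(x), orig_gpr2)

/* direct flavor: a single CO-RE-relocated load, no helper call */
#define PARM1_DIRECT_SKETCH(x) \
	(((const struct pt_regs___s390 *)(x))->orig_gpr2)

The direct form is what PT_REGS_PARM1_SYSCALL() becomes, while the BPF_CORE_READ() form stays available as PT_REGS_PARM1_CORE_SYSCALL(); both relocate the field offset via CO-RE, but only the latter pays for a helper call.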
Suggested-by: Andrii Nakryiko Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Acked-by: Xu Kuohai Link: https://lore.kernel.org/bpf/20240831041934.1629216-3-pulehui@huaweicloud.com --- tools/lib/bpf/bpf_tracing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 1297d8d297d1..c15831c98990 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -222,7 +222,7 @@ struct pt_regs___s390 { struct pt_regs___arm64 { unsigned long orig_x0; -}; +} __attribute__((preserve_access_index)); /* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) @@ -241,7 +241,7 @@ struct pt_regs___arm64 { #define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG #define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG #define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG -#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___arm64 *)(x))->__PT_PARM1_SYSCALL_REG) #define PT_REGS_PARM1_CORE_SYSCALL(x) \ BPF_CORE_READ((const struct pt_regs___arm64 *)(x), __PT_PARM1_SYSCALL_REG) -- cgit v1.2.3 From 4a4c4c0d0a42079d2fa97a232895c56ac2f3f573 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 31 Aug 2024 04:19:33 +0000 Subject: selftests/bpf: Enable test_bpf_syscall_macro: Syscall_arg1 on s390 and arm64 Considering that CO-RE direct read access to the first system call argument is already available on s390 and arm64, let's enable test_bpf_syscall_macro:syscall_arg1 on these architectures. Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240831041934.1629216-4-pulehui@huaweicloud.com --- tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c | 4 ---- tools/testing/selftests/bpf/progs/bpf_syscall_macro.c | 2 -- 2 files changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c index 2900c5e9a016..1750c29b94f8 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c @@ -38,11 +38,7 @@ void test_bpf_syscall_macro(void) /* check whether args of syscall are copied correctly */ prctl(exp_arg1, exp_arg2, exp_arg3, exp_arg4, exp_arg5); -#if defined(__aarch64__) || defined(__s390__) - ASSERT_NEQ(skel->bss->arg1, exp_arg1, "syscall_arg1"); -#else ASSERT_EQ(skel->bss->arg1, exp_arg1, "syscall_arg1"); -#endif ASSERT_EQ(skel->bss->arg2, exp_arg2, "syscall_arg2"); ASSERT_EQ(skel->bss->arg3, exp_arg3, "syscall_arg3"); /* it cannot copy arg4 when uses PT_REGS_PARM4 on x86_64 */ diff --git a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c index 1a476d8ed354..9e7d9674ce2a 100644 --- a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c +++ b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c @@ -43,9 +43,7 @@ int BPF_KPROBE(handle_sys_prctl) /* test for PT_REGS_PARM */ -#if !defined(bpf_target_arm64) && !defined(bpf_target_s390) bpf_probe_read_kernel(&tmp, sizeof(tmp), &PT_REGS_PARM1_SYSCALL(real_regs)); -#endif arg1 = tmp; bpf_probe_read_kernel(&arg2, sizeof(arg2), &PT_REGS_PARM2_SYSCALL(real_regs)); bpf_probe_read_kernel(&arg3, sizeof(arg3), &PT_REGS_PARM3_SYSCALL(real_regs)); -- cgit v1.2.3 From 99857422338b67f0a2927cbc44fdaa8783717858 Mon Sep 17 00:00:00 2001 
From: Pu Lehui Date: Sat, 31 Aug 2024 04:19:34 +0000 Subject: libbpf: Fix accessing first syscall argument on RV64 On RV64, as Ilya mentioned before [0], the first syscall parameter should be accessed through orig_a0 (see arch/riscv/include/asm/syscall.h), otherwise it will cause selftests like bpf_syscall_macro, vmlinux, test_lsm, etc. to fail on RV64. Let's fix it by using the struct pt_regs style CO-RE direct access. Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220209021745.2215452-1-iii@linux.ibm.com [0] Link: https://lore.kernel.org/bpf/20240831041934.1629216-5-pulehui@huaweicloud.com --- tools/lib/bpf/bpf_tracing.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index c15831c98990..bad7a0dbf510 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -351,6 +351,10 @@ struct pt_regs___arm64 { * https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions */ +struct pt_regs___riscv { + unsigned long orig_a0; +} __attribute__((preserve_access_index)); + /* riscv provides struct user_regs_struct instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) #define __PT_PARM1_REG a0 @@ -362,12 +366,15 @@ struct pt_regs___arm64 { #define __PT_PARM7_REG a6 #define __PT_PARM8_REG a7 -#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM1_SYSCALL_REG orig_a0 #define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG #define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG #define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG #define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG #define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___riscv *)(x))->__PT_PARM1_SYSCALL_REG) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___riscv *)(x), __PT_PARM1_SYSCALL_REG) #define __PT_RET_REG ra #define __PT_FP_REG s0 -- cgit v1.2.3 From 5e5cc1eb65256e6017e3deec04f9806f2f317853 Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Mon, 2 Sep 2024 12:21:03 +0800 Subject: tools: hv: rm .*.cmd when make clean rm .*.cmd when make clean Signed-off-by: zhang jiao Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20240902042103.5867-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Wei Liu Message-ID: <20240902042103.5867-1-zhangjiao2@cmss.chinamobile.com> --- tools/hv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/hv/Makefile b/tools/hv/Makefile index 2e60e2c212cd..34ffcec264ab 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -52,7 +52,7 @@ $(OUTPUT)hv_fcopy_uio_daemon: $(HV_FCOPY_UIO_DAEMON_IN) clean: rm -f $(ALL_PROGRAMS) - find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(sbindir); \ -- cgit v1.2.3 From 05144ab7b7eaf531fc728fcb79dcf36b621ff42d Mon Sep 17 00:00:00 2001 From: "Nícolas F. R. A. Prado" Date: Mon, 29 Jul 2024 16:56:02 -0400 Subject: kselftest: dt: Ignore nodes that have ancestors disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filter out nodes that have one of their ancestors disabled as they aren't expected to probe.
This removes the following false-positive failures on the sc7180-trogdor-lazor-limozeen-nots-r5 platform: /soc@0/geniqup@8c0000/i2c@894000/proximity@28 /soc@0/geniqup@ac0000/spi@a90000/ec@0 /soc@0/remoteproc@62400000/glink-edge/apr /soc@0/remoteproc@62400000/glink-edge/apr/service@3 /soc@0/remoteproc@62400000/glink-edge/apr/service@4 /soc@0/remoteproc@62400000/glink-edge/apr/service@4/clock-controller /soc@0/remoteproc@62400000/glink-edge/apr/service@4/dais /soc@0/remoteproc@62400000/glink-edge/apr/service@7 /soc@0/remoteproc@62400000/glink-edge/apr/service@7/dais /soc@0/remoteproc@62400000/glink-edge/apr/service@8 /soc@0/remoteproc@62400000/glink-edge/apr/service@8/routing /soc@0/remoteproc@62400000/glink-edge/fastrpc /soc@0/remoteproc@62400000/glink-edge/fastrpc/compute-cb@3 /soc@0/remoteproc@62400000/glink-edge/fastrpc/compute-cb@4 /soc@0/remoteproc@62400000/glink-edge/fastrpc/compute-cb@5 /soc@0/spmi@c440000/pmic@0/pon@800/pwrkey Fixes: 14571ab1ad21 ("kselftest: Add new test for detecting unprobed Devicetree devices") Signed-off-by: Nícolas F. R. A. Prado Link: https://lore.kernel.org/r/20240729-dt-kselftest-parent-disabled-v2-1-d7a001c4930d@collabora.com Signed-off-by: Rob Herring (Arm) --- tools/testing/selftests/dt/test_unprobed_devices.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/dt/test_unprobed_devices.sh b/tools/testing/selftests/dt/test_unprobed_devices.sh index 2d7e70c5ad2d..5e3f42ef249e 100755 --- a/tools/testing/selftests/dt/test_unprobed_devices.sh +++ b/tools/testing/selftests/dt/test_unprobed_devices.sh @@ -34,8 +34,21 @@ nodes_compatible=$( # Check if node is available if [[ -e "${node}"/status ]]; then status=$(tr -d '\000' < "${node}"/status) - [[ "${status}" != "okay" && "${status}" != "ok" ]] && continue + if [[ "${status}" != "okay" && "${status}" != "ok" ]]; then + if [ -n "${disabled_nodes_regex}" ]; then + disabled_nodes_regex="${disabled_nodes_regex}|${node}" + else + disabled_nodes_regex="${node}" + fi + continue + fi fi + + # Ignore this node if one of its ancestors was disabled + if [ -n "${disabled_nodes_regex}" ]; then + echo "${node}" | grep -q -E "${disabled_nodes_regex}" && continue + fi + echo "${node}" | sed -e 's|\/proc\/device-tree||' done | sort ) -- cgit v1.2.3 From 04b01625da130c7521b768996cd5e48052198b97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Sep 2024 10:46:00 -0700 Subject: perf/uprobe: split uprobe_unregister() With uprobe_unregister() having grown a synchronize_srcu(), it becomes fairly slow to call. Esp. since both users of this API call it in a loop. Peel off the sync_srcu() and do it once, after the loop. We also need to add uprobe_unregister_sync() into uprobe_register()'s error handling path, as we need to be careful about returning to the caller before we have a guarantee that partially attached consumer won't be called anymore. This is an unlikely slow path and this should be totally fine to be slow in the case of a failed attach. 
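The resulting calling convention for callers that detach many probes looks roughly like this; struct my_uprobe_ent and detach_all() are made up for illustration, the real users are updated in the hunks below:

/* hypothetical caller with an array of attached consumers */
static void detach_all(struct my_uprobe_ent *ents, u32 cnt)
{
	u32 i;

	/* cheap per-probe part: just unlink each consumer */
	for (i = 0; i < cnt; i++)
		uprobe_unregister_nosync(ents[i].uprobe, &ents[i].consumer);

	/* expensive part: one synchronize_srcu() for the whole batch */
	if (cnt)
		uprobe_unregister_sync();
}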
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: "Peter Zijlstra (Intel)" Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-6-andrii@kernel.org --- include/linux/uprobes.h | 8 ++++++-- kernel/events/uprobes.c | 21 +++++++++++++++------ kernel/trace/bpf_trace.c | 5 ++++- kernel/trace/trace_uprobe.c | 6 +++++- .../testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 3 ++- 5 files changed, 32 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f112b56e9479..2b294bf1881f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -115,7 +115,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_sync(void); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -164,7 +165,10 @@ uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) return -ENOSYS; } static inline void -uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ +} +static inline void uprobe_unregister_sync(void) { } static inline int uprobe_mmap(struct vm_area_struct *vma) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e15c0306e535..694f6790e848 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1105,11 +1105,11 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) } /** - * uprobe_unregister - unregister an already registered probe. + * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ -void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; @@ -1121,12 +1121,15 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) /* TODO : cant unregister? 
schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); - goto out_sync; + return; } put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); -out_sync: +void uprobe_unregister_sync(void) +{ /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding @@ -1138,7 +1141,7 @@ out_sync: */ synchronize_srcu(&uprobes_srcu); } -EXPORT_SYMBOL_GPL(uprobe_unregister); +EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe @@ -1196,7 +1199,13 @@ struct uprobe *uprobe_register(struct inode *inode, up_write(&uprobe->register_rwsem); if (ret) { - uprobe_unregister(uprobe, uc); + uprobe_unregister_nosync(uprobe, uc); + /* + * Registration might have partially succeeded, so we can have + * this consumer being called right at this time. We need to + * sync here. It's ok, it's unlikely slow path. + */ + uprobe_unregister_sync(); return ERR_PTR(ret); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c99bf0617602..ac0a01cc8634 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3184,7 +3184,10 @@ static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) u32 i; for (i = 0; i < cnt; i++) - uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer); + uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); + + if (cnt) + uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7eb79e0a5352..f7443e996b1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1097,6 +1097,7 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) static void __probe_event_disable(struct trace_probe *tp) { struct trace_uprobe *tu; + bool sync = false; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); @@ -1105,9 +1106,12 @@ static void __probe_event_disable(struct trace_probe *tp) if (!tu->uprobe) continue; - uprobe_unregister(tu->uprobe, &tu->consumer); + uprobe_unregister_nosync(tu->uprobe, &tu->consumer); + sync = true; tu->uprobe = NULL; } + if (sync) + uprobe_unregister_sync(); } static int probe_event_enable(struct trace_event_call *call, diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 3c0515a27842..1fc16657cf42 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -475,7 +475,8 @@ static void testmod_unregister_uprobe(void) mutex_lock(&testmod_uprobe_mutex); if (uprobe.uprobe) { - uprobe_unregister(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_sync(); path_put(&uprobe.path); uprobe.uprobe = NULL; } -- cgit v1.2.3 From dfc58f467f7163b28d8aaadafd86d93dd08ea01c Mon Sep 17 00:00:00 2001 From: zhangjiao Date: Thu, 29 Aug 2024 13:33:09 +0800 Subject: tools: iio: rm .*.cmd when make clean rm .*.cmd when make clean Signed-off-by: zhangjiao Link: https://patch.msgid.link/20240829053309.10563-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Jonathan Cameron --- tools/iio/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/iio/Makefile b/tools/iio/Makefile index fa720f062229..3bcce0b7d10f 100644 --- a/tools/iio/Makefile +++ 
b/tools/iio/Makefile @@ -58,7 +58,7 @@ $(OUTPUT)iio_generic_buffer: $(IIO_GENERIC_BUFFER_IN) clean: rm -f $(ALL_PROGRAMS) rm -rf $(OUTPUT)include/linux/iio - find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(bindir); \ -- cgit v1.2.3 From 0b0bb453716f1469105f075044c68ee9670fe4ee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Sep 2024 14:51:22 +0300 Subject: selftests/bpf: Add child argument to spawn_child function Adding child argument to spawn_child function to allow to create multiple children in following change. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240905115124.1503998-3-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 85 ++++++++++------------ 1 file changed, 39 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index acb62675ff65..250eb47c68f9 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -68,29 +68,27 @@ static void kick_child(struct child *child) fflush(NULL); } -static struct child *spawn_child(void) +static int spawn_child(struct child *child) { - static struct child child; - int err; - int c; + int err, c; /* pipe to notify child to execute the trigger functions */ - if (pipe(child.go)) - return NULL; + if (pipe(child->go)) + return -1; - child.pid = child.tid = fork(); - if (child.pid < 0) { - release_child(&child); + child->pid = child->tid = fork(); + if (child->pid < 0) { + release_child(child); errno = EINVAL; - return NULL; + return -1; } /* child */ - if (child.pid == 0) { - close(child.go[1]); + if (child->pid == 0) { + close(child->go[1]); /* wait for parent's kick */ - err = read(child.go[0], &c, 1); + err = read(child->go[0], &c, 1); if (err != 1) exit(err); @@ -102,7 +100,7 @@ static struct child *spawn_child(void) exit(errno); } - return &child; + return 0; } static void *child_thread(void *ctx) @@ -131,39 +129,38 @@ static void *child_thread(void *ctx) pthread_exit(&err); } -static struct child *spawn_thread(void) +static int spawn_thread(struct child *child) { - static struct child child; int c, err; /* pipe to notify child to execute the trigger functions */ - if (pipe(child.go)) - return NULL; + if (pipe(child->go)) + return -1; /* pipe to notify parent that child thread is ready */ - if (pipe(child.c2p)) { - close(child.go[0]); - close(child.go[1]); - return NULL; + if (pipe(child->c2p)) { + close(child->go[0]); + close(child->go[1]); + return -1; } - child.pid = getpid(); + child->pid = getpid(); - err = pthread_create(&child.thread, NULL, child_thread, &child); + err = pthread_create(&child->thread, NULL, child_thread, child); if (err) { err = -errno; - close(child.go[0]); - close(child.go[1]); - close(child.c2p[0]); - close(child.c2p[1]); + close(child->go[0]); + close(child->go[1]); + close(child->c2p[0]); + close(child->c2p[1]); errno = -err; - return NULL; + return -1; } - err = read(child.c2p[0], &c, 1); + err = read(child->c2p[0], &c, 1); if (!ASSERT_EQ(err, 1, "child_thread_ready")) - return NULL; + return -1; - return &child; + return 0; } static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child) @@ -304,24 +301,22 @@ cleanup: static void 
test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_multi_opts *opts) { - struct child *child; + static struct child child; /* no pid filter */ __test_attach_api(binary, pattern, opts, NULL); /* pid filter */ - child = spawn_child(); - if (!ASSERT_OK_PTR(child, "spawn_child")) + if (!ASSERT_OK(spawn_child(&child), "spawn_child")) return; - __test_attach_api(binary, pattern, opts, child); + __test_attach_api(binary, pattern, opts, &child); /* pid filter (thread) */ - child = spawn_thread(); - if (!ASSERT_OK_PTR(child, "spawn_thread")) + if (!ASSERT_OK(spawn_thread(&child), "spawn_thread")) return; - __test_attach_api(binary, pattern, opts, child); + __test_attach_api(binary, pattern, opts, &child); } static void test_attach_api_pattern(void) @@ -712,24 +707,22 @@ cleanup: static void test_link_api(void) { - struct child *child; + static struct child child; /* no pid filter */ __test_link_api(NULL); /* pid filter */ - child = spawn_child(); - if (!ASSERT_OK_PTR(child, "spawn_child")) + if (!ASSERT_OK(spawn_child(&child), "spawn_child")) return; - __test_link_api(child); + __test_link_api(&child); /* pid filter (thread) */ - child = spawn_thread(); - if (!ASSERT_OK_PTR(child, "spawn_thread")) + if (!ASSERT_OK(spawn_thread(&child), "spawn_thread")) return; - __test_link_api(child); + __test_link_api(&child); } static struct bpf_program * -- cgit v1.2.3 From 8df43e859454952b0dd18c5821b32e6b2d48fddd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Sep 2024 14:51:23 +0300 Subject: selftests/bpf: Add uprobe multi pid filter test for fork-ed processes The idea is to create and monitor 3 uprobes, each triggered in a separate process, and make sure the bpf program gets executed just for the proper PID specified via pid filter. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240905115124.1503998-4-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 67 ++++++++++++++++++++++ .../selftests/bpf/progs/uprobe_multi_pid_filter.c | 40 +++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_pid_filter.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 250eb47c68f9..9c2f99233304 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -7,6 +7,7 @@ #include "uprobe_multi_bench.skel.h" #include "uprobe_multi_usdt.skel.h" #include "uprobe_multi_consumers.skel.h" +#include "uprobe_multi_pid_filter.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" #include "../sdt.h" @@ -935,6 +936,70 @@ static void test_consumers(void) uprobe_multi_consumers__destroy(skel); } +static struct bpf_program *uprobe_multi_program(struct uprobe_multi_pid_filter *skel, int idx) +{ + switch (idx) { + case 0: return skel->progs.uprobe_multi_0; + case 1: return skel->progs.uprobe_multi_1; + case 2: return skel->progs.uprobe_multi_2; + } + return NULL; +} + +#define TASKS 3 + +static void run_pid_filter(struct uprobe_multi_pid_filter *skel, bool retprobe) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, .retprobe = retprobe); + struct bpf_link *link[TASKS] = {}; + struct child child[TASKS] = {}; + int i; + + memset(skel->bss->test, 0, sizeof(skel->bss->test)); + + for (i = 0; i < TASKS; i++) { + if (!ASSERT_OK(spawn_child(&child[i]), "spawn_child")) + goto cleanup; + skel->bss->pids[i] =
child[i].pid; + } + + for (i = 0; i < TASKS; i++) { + link[i] = bpf_program__attach_uprobe_multi(uprobe_multi_program(skel, i), + child[i].pid, "/proc/self/exe", + "uprobe_multi_func_1", &opts); + if (!ASSERT_OK_PTR(link[i], "bpf_program__attach_uprobe_multi")) + goto cleanup; + } + + for (i = 0; i < TASKS; i++) + kick_child(&child[i]); + + for (i = 0; i < TASKS; i++) { + ASSERT_EQ(skel->bss->test[i][0], 1, "pid"); + ASSERT_EQ(skel->bss->test[i][1], 0, "unknown"); + } + +cleanup: + for (i = 0; i < TASKS; i++) + bpf_link__destroy(link[i]); + for (i = 0; i < TASKS; i++) + release_child(&child[i]); +} + +static void test_pid_filter_process(void) +{ + struct uprobe_multi_pid_filter *skel; + + skel = uprobe_multi_pid_filter__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi_pid_filter__open_and_load")) + return; + + run_pid_filter(skel, false); + run_pid_filter(skel, true); + + uprobe_multi_pid_filter__destroy(skel); +} + static void test_bench_attach_uprobe(void) { long attach_start_ns = 0, attach_end_ns = 0; @@ -1027,4 +1092,6 @@ void test_uprobe_multi_test(void) test_attach_uprobe_fails(); if (test__start_subtest("consumers")) test_consumers(); + if (test__start_subtest("filter_fork")) + test_pid_filter_process(); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_pid_filter.c b/tools/testing/selftests/bpf/progs/uprobe_multi_pid_filter.c new file mode 100644 index 000000000000..67fcbad36661 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_pid_filter.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +__u32 pids[3]; +__u32 test[3][2]; + +static void update_pid(int idx) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + + if (pid == pids[idx]) + test[idx][0]++; + else + test[idx][1]++; +} + +SEC("uprobe.multi") +int uprobe_multi_0(struct pt_regs *ctx) +{ + update_pid(0); + return 0; +} + +SEC("uprobe.multi") +int uprobe_multi_1(struct pt_regs *ctx) +{ + update_pid(1); + return 0; +} + +SEC("uprobe.multi") +int uprobe_multi_2(struct pt_regs *ctx) +{ + update_pid(2); + return 0; +} -- cgit v1.2.3 From d2520bdb1932fe615d87458ea2df2495bb25c886 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Sep 2024 14:51:24 +0300 Subject: selftests/bpf: Add uprobe multi pid filter test for clone-ed processes The idea is to run the same test as test_pid_filter_process, but instead of a standard fork-ed process we create the process with clone(CLONE_VM..) to make sure the thread leader process filter works properly in this case.
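As a standalone sketch of what clone(CLONE_VM) buys the test (simplified from the spawn_child_flag() change below; the shared-variable handshake and all names here are made up):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static char stack[65536];
static volatile pid_t from_child;

static int child_fn(void *arg)
{
	/* no pipe needed: CLONE_VM makes this store visible to the parent */
	from_child = getpid();
	return 0;
}

int main(void)
{
	/* the stack grows down on common arches, so pass the buffer's top */
	pid_t pid = clone(child_fn, stack + sizeof(stack), CLONE_VM | SIGCHLD, NULL);

	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	/* a separate process with its own pid, unlike a CLONE_THREAD thread */
	printf("clone returned %d, child saw %d\n", pid, from_child);
	return 0;
}

The child is a distinct process (own PID, so it is its own thread group leader), which is exactly the case the pid filter must still match, while CLONE_VM keeps the parent's address space, including the probed text, shared.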
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240905115124.1503998-5-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 66 ++++++++++++++-------- 1 file changed, 42 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 9c2f99233304..f160d01ba5da 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -40,6 +40,7 @@ struct child { int pid; int tid; pthread_t thread; + char stack[65536]; }; static void release_child(struct child *child) @@ -69,41 +70,56 @@ static void kick_child(struct child *child) fflush(NULL); } -static int spawn_child(struct child *child) +static int child_func(void *arg) { + struct child *child = arg; int err, c; + close(child->go[1]); + + /* wait for parent's kick */ + err = read(child->go[0], &c, 1); + if (err != 1) + exit(err); + + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + usdt_trigger(); + + exit(errno); +} + +static int spawn_child_flag(struct child *child, bool clone_vm) +{ /* pipe to notify child to execute the trigger functions */ if (pipe(child->go)) return -1; - child->pid = child->tid = fork(); + if (clone_vm) { + child->pid = child->tid = clone(child_func, child->stack + sizeof(child->stack)/2, + CLONE_VM|SIGCHLD, child); + } else { + child->pid = child->tid = fork(); + } if (child->pid < 0) { release_child(child); errno = EINVAL; return -1; } - /* child */ - if (child->pid == 0) { - close(child->go[1]); - - /* wait for parent's kick */ - err = read(child->go[0], &c, 1); - if (err != 1) - exit(err); - - uprobe_multi_func_1(); - uprobe_multi_func_2(); - uprobe_multi_func_3(); - usdt_trigger(); - - exit(errno); - } + /* fork-ed child */ + if (!clone_vm && child->pid == 0) + child_func(child); return 0; } +static int spawn_child(struct child *child) +{ + return spawn_child_flag(child, false); +} + static void *child_thread(void *ctx) { struct child *child = ctx; @@ -948,7 +964,7 @@ static struct bpf_program *uprobe_multi_program(struct uprobe_multi_pid_filter * #define TASKS 3 -static void run_pid_filter(struct uprobe_multi_pid_filter *skel, bool retprobe) +static void run_pid_filter(struct uprobe_multi_pid_filter *skel, bool clone_vm, bool retprobe) { LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, .retprobe = retprobe); struct bpf_link *link[TASKS] = {}; @@ -958,7 +974,7 @@ static void run_pid_filter(struct uprobe_multi_pid_filter *skel, bool retprobe) memset(skel->bss->test, 0, sizeof(skel->bss->test)); for (i = 0; i < TASKS; i++) { - if (!ASSERT_OK(spawn_child(&child[i]), "spawn_child")) + if (!ASSERT_OK(spawn_child_flag(&child[i], clone_vm), "spawn_child")) goto cleanup; skel->bss->pids[i] = child[i].pid; } @@ -986,7 +1002,7 @@ cleanup: release_child(&child[i]); } -static void test_pid_filter_process(void) +static void test_pid_filter_process(bool clone_vm) { struct uprobe_multi_pid_filter *skel; @@ -994,8 +1010,8 @@ static void test_pid_filter_process(void) if (!ASSERT_OK_PTR(skel, "uprobe_multi_pid_filter__open_and_load")) return; - run_pid_filter(skel, false); - run_pid_filter(skel, true); + run_pid_filter(skel, clone_vm, false); + run_pid_filter(skel, clone_vm, true); uprobe_multi_pid_filter__destroy(skel); } @@ -1093,5 +1109,7 @@ void test_uprobe_multi_test(void) if (test__start_subtest("consumers")) test_consumers(); if 
(test__start_subtest("filter_fork")) - test_pid_filter_process(); + test_pid_filter_process(false); + if (test__start_subtest("filter_clone_vm")) + test_pid_filter_process(true); } -- cgit v1.2.3 From 5db0ba6766f8a6606e655ddad745c87bc01349c7 Mon Sep 17 00:00:00 2001 From: Lin Yikai Date: Thu, 5 Sep 2024 19:03:05 +0800 Subject: selftests/bpf: fix some typos in selftests Hi, fix some spelling errors in selftest, the details are as follows: -in the codes: test_bpf_sk_stoarge_map_iter_fd(void) ->test_bpf_sk_storage_map_iter_fd(void) load BTF from btf_data.o->load BTF from btf_data.bpf.o -in the code comments: preample->preamble multi-contollers->multi-controllers errono->errno unsighed/unsinged->unsigned egree->egress shoud->should regsiter->register assummed->assumed conditiona->conditional rougly->roughly timetamp->timestamp ingores->ignores null-termainted->null-terminated slepable->sleepable implemenation->implementation veriables->variables timetamps->timestamps substitue a costant->substitute a constant secton->section unreferened->unreferenced verifer->verifier libppf->libbpf ... Signed-off-by: Lin Yikai Link: https://lore.kernel.org/r/20240905110354.3274546-1-yikai.lin@vivo.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/benchs/bench_trigger.c | 2 +- tools/testing/selftests/bpf/cgroup_helpers.c | 2 +- tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c | 2 +- .../selftests/bpf/map_tests/lpm_trie_map_batch_ops.c | 2 +- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/btf.c | 6 +++--- tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c | 2 +- tools/testing/selftests/bpf/prog_tests/log_buf.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/reg_bounds.c | 14 +++++++------- tools/testing/selftests/bpf/prog_tests/resolve_btfids.c | 2 +- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 2 +- tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c | 2 +- tools/testing/selftests/bpf/prog_tests/test_strncmp.c | 2 +- tools/testing/selftests/bpf/prog_tests/token.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 2 +- tools/testing/selftests/bpf/prog_tests/user_ringbuf.c | 2 +- tools/testing/selftests/bpf/progs/bpf_cubic.c | 6 +++--- tools/testing/selftests/bpf/progs/strobemeta.h | 4 ++-- .../testing/selftests/bpf/progs/test_cls_redirect_dynptr.c | 2 +- tools/testing/selftests/bpf/progs/test_core_read_macros.c | 2 +- tools/testing/selftests/bpf/progs/test_global_func15.c | 2 +- tools/testing/selftests/bpf/progs/test_global_map_resize.c | 2 +- tools/testing/selftests/bpf/test_maps.c | 2 +- tools/testing/selftests/bpf/test_progs.c | 2 +- tools/testing/selftests/bpf/verifier/map_kptr.c | 2 +- 25 files changed, 39 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index a220545a3238..2ed0ef6f21ee 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -276,7 +276,7 @@ static void trigger_rawtp_setup(void) * instructions. So use two different targets, one of which starts with nop * and another doesn't. * - * GCC doesn't generate stack setup preample for these functions due to them + * GCC doesn't generate stack setup preamble for these functions due to them * having no input arguments and doing nothing in the body. 
*/ __nocf_check __weak void uprobe_target_nop(void) diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 23bb9a9e6a7d..e4535451322e 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -644,7 +644,7 @@ unsigned long long get_classid_cgroup_id(void) /** * get_cgroup1_hierarchy_id - Retrieves the ID of a cgroup1 hierarchy from the cgroup1 subsys name. * @subsys_name: The cgroup1 subsys name, which can be retrieved from /proc/self/cgroup. It can be - * a named cgroup like "name=systemd", a controller name like "net_cls", or multi-contollers like + * a named cgroup like "name=systemd", a controller name like "net_cls", or multi-controllers like * "net_cls,net_prio". */ int get_cgroup1_hierarchy_id(const char *subsys_name) diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c index 1230ccf90128..5da493b94ae2 100644 --- a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c @@ -197,7 +197,7 @@ void __test_map_lookup_and_delete_batch(bool is_pcpu) CHECK(total != max_entries, "delete with steps", "total = %u, max_entries = %u\n", total, max_entries); - /* check map is empty, errono == ENOENT */ + /* check map is empty, errno == ENOENT */ err = bpf_map_get_next_key(map_fd, NULL, &key); CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", "error: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c index b66d56ddb7ef..fe3e19f96244 100644 --- a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c @@ -135,7 +135,7 @@ void test_lpm_trie_map_batch_ops(void) CHECK(total != max_entries, "delete with steps", "total = %u, max_entries = %u\n", total, max_entries); - /* check map is empty, errono == ENOENT */ + /* check map is empty, errno == ENOENT */ err = bpf_map_get_next_key(map_fd, NULL, &key); CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", "error: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 618af9dfae9b..52e6f7570475 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -1218,7 +1218,7 @@ out: bpf_iter_bpf_sk_storage_helpers__destroy(skel); } -static void test_bpf_sk_stoarge_map_iter_fd(void) +static void test_bpf_sk_storage_map_iter_fd(void) { struct bpf_iter_bpf_sk_storage_map *skel; @@ -1693,7 +1693,7 @@ void test_bpf_iter(void) if (test__start_subtest("bpf_sk_storage_map")) test_bpf_sk_storage_map(); if (test__start_subtest("bpf_sk_storage_map_iter_fd")) - test_bpf_sk_stoarge_map_iter_fd(); + test_bpf_sk_storage_map_iter_fd(); if (test__start_subtest("bpf_sk_storage_delete")) test_bpf_sk_storage_delete(); if (test__start_subtest("bpf_sk_storage_get")) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 00965a6e83bb..7eafcf91b02e 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4986,7 +4986,7 @@ struct pprint_mapv_int128 { static struct btf_raw_test pprint_test_template[] = { { .raw_types = { - /* unsighed char */ /* [1] */ + /* unsigned char */ /* 
[1] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), /* unsigned short */ /* [2] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2), @@ -5053,7 +5053,7 @@ static struct btf_raw_test pprint_test_template[] = { * be encoded with kind_flag set. */ .raw_types = { - /* unsighed char */ /* [1] */ + /* unsigned char */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), /* unsigned short */ /* [2] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2), @@ -5120,7 +5120,7 @@ static struct btf_raw_test pprint_test_template[] = { * will have both int and enum types. */ .raw_types = { - /* unsighed char */ /* [1] */ + /* unsigned char */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), /* unsigned short */ /* [2] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2), diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c index 63ee892bc757..10224f845568 100644 --- a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c @@ -214,7 +214,7 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd) /* Attach to parent and child cgroup, trigger packet from child. * Assert that there is six additional runs, parent cgroup egresses and * ingress, child cgroup egresses and ingress. - * Assert that egree and ingress storages are separate. + * Assert that egress and ingress storages are separate. */ child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, child_cgroup_fd); diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c index 46611040dec3..27676a04d0b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/log_buf.c +++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c @@ -160,7 +160,7 @@ static void bpf_prog_load_log_buf(void) opts.log_buf = log_buf; opts.log_size = log_buf_sz; - /* with log_level == 0 log_buf shoud stay empty for good prog */ + /* with log_level == 0 log_buf should stay empty for good prog */ log_buf[0] = '\0'; opts.log_level = 0; fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "good_prog", "GPL", @@ -222,7 +222,7 @@ static void bpf_btf_load_log_buf(void) opts.log_buf = log_buf; opts.log_size = log_buf_sz; - /* with log_level == 0 log_buf shoud stay empty for good BTF */ + /* with log_level == 0 log_buf should stay empty for good BTF */ log_buf[0] = '\0'; opts.log_level = 0; fd = bpf_btf_load(raw_btf_data, raw_btf_size, &opts); diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 467027236d30..39d42271cc46 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -503,7 +503,7 @@ static const char *op_str(enum op op) /* Can register with range [x.a, x.b] *EVER* satisfy * OP (<, <=, >, >=, ==, !=) relation to - * a regsiter with range [y.a, y.b] + * a register with range [y.a, y.b] * _in *num_t* domain_ */ static bool range_canbe_op(enum num_t t, struct range x, struct range y, enum op op) @@ -532,7 +532,7 @@ static bool range_canbe_op(enum num_t t, struct range x, struct range y, enum op /* Does register with range [x.a, x.b] *ALWAYS* satisfy * OP (<, <=, >, >=, ==, !=) relation to - * a regsiter with range [y.a, y.b] + * a register with range [y.a, y.b] * _in *num_t* domain_ */ static bool range_always_op(enum num_t t, struct range x, struct range y, enum op op) @@ -543,7 +543,7 @@ static bool range_always_op(enum num_t t, struct range x, struct range y, enum o /* 
Does register with range [x.a, x.b] *NEVER* satisfy * OP (<, <=, >, >=, ==, !=) relation to - * a regsiter with range [y.a, y.b] + * a register with range [y.a, y.b] * _in *num_t* domain_ */ static bool range_never_op(enum num_t t, struct range x, struct range y, enum op op) @@ -1018,11 +1018,11 @@ static int parse_reg_state(const char *s, struct reg_state *reg) * - umin=%llu, if missing, assumed 0; * - umax=%llu, if missing, assumed U64_MAX; * - smin=%lld, if missing, assumed S64_MIN; - * - smax=%lld, if missing, assummed S64_MAX; + * - smax=%lld, if missing, assumed S64_MAX; * - umin32=%d, if missing, assumed 0; * - umax32=%d, if missing, assumed U32_MAX; * - smin32=%d, if missing, assumed S32_MIN; - * - smax32=%d, if missing, assummed S32_MAX; + * - smax32=%d, if missing, assumed S32_MAX; * - var_off=(%#llx; %#llx), tnum part, we don't care about it. * * If some of the values are equal, they will be grouped (but min/max @@ -1884,7 +1884,7 @@ cleanup: * envvar is not set, this test is skipped during test_progs testing. * * We split this up into smaller subsets based on initialization and - * conditiona numeric domains to get an easy parallelization with test_progs' + * conditional numeric domains to get an easy parallelization with test_progs' * -j argument. */ @@ -1938,7 +1938,7 @@ static u64 rand_u64() { /* RAND_MAX is guaranteed to be at least 1<<15, but in practice it * seems to be 1<<31, so we need to call it thrice to get full u64; - * we'll use rougly equal split: 22 + 21 + 21 bits + * we'll use roughly equal split: 22 + 21 + 21 bits */ return ((u64)random() << 42) | (((u64)random() & RAND_21BIT_MASK) << 21) | diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index f81d08d429a2..51544372f52e 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -103,7 +103,7 @@ static int resolve_symbols(void) btf = btf__parse_elf("btf_data.bpf.o", NULL); if (CHECK(libbpf_get_error(btf), "resolve", - "Failed to load BTF from btf_data.o\n")) + "Failed to load BTF from btf_data.bpf.o\n")) return -1; nr = btf__type_cnt(btf); diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 974f9d6269c9..c85798966aec 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -872,7 +872,7 @@ static void test_tcp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd) test_inet_dtime(family, SOCK_STREAM, addr, 50000 + t); /* fwdns_prio100 prog does not read delivery_time_type, so - * kernel puts the (rcv) timetamp in __sk_buff->tstamp + * kernel puts the (rcv) timestamp in __sk_buff->tstamp */ ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0, dtime_cnt_str(t, INGRESS_FWDNS_P100)); diff --git a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c index a0054019e677..9c0200c132d9 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c @@ -51,7 +51,7 @@ static int run_set_secureexec(int map_fd, int secureexec) exit(ret); /* If the binary is executed with securexec=1, the dynamic - * loader ingores and unsets certain variables like LD_PRELOAD, + * loader ignores and unsets certain variables like LD_PRELOAD, * TMPDIR etc. 
TMPDIR is used here to simplify the example, as * LD_PRELOAD requires a real .so file. * diff --git a/tools/testing/selftests/bpf/prog_tests/test_strncmp.c b/tools/testing/selftests/bpf/prog_tests/test_strncmp.c index 7ddd6615b7e7..baceb0de9d49 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_strncmp.c +++ b/tools/testing/selftests/bpf/prog_tests/test_strncmp.c @@ -72,7 +72,7 @@ static void test_strncmp_ret(void) got = trigger_strncmp(skel); ASSERT_EQ(got, 0, "strncmp: same str"); - /* Not-null-termainted string */ + /* Not-null-terminated string */ memcpy(skel->bss->str, skel->rodata->target, sizeof(skel->bss->str)); skel->bss->str[sizeof(skel->bss->str) - 1] = 'A'; got = trigger_strncmp(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/token.c b/tools/testing/selftests/bpf/prog_tests/token.c index fc4a175d8d76..fe86e4fdb89c 100644 --- a/tools/testing/selftests/bpf/prog_tests/token.c +++ b/tools/testing/selftests/bpf/prog_tests/token.c @@ -867,7 +867,7 @@ static int userns_obj_priv_implicit_token(int mnt_fd, struct token_lsm *lsm_skel } unsetenv(TOKEN_ENVVAR); - /* now the same struct_ops skeleton should succeed thanks to libppf + /* now the same struct_ops skeleton should succeed thanks to libbpf * creating BPF token from /sys/fs/bpf mount point */ skel = dummy_st_ops_success__open_and_load(); @@ -929,7 +929,7 @@ static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *l if (!ASSERT_OK(err, "setenv_token_path")) goto err_out; - /* now the same struct_ops skeleton should succeed thanks to libppf + /* now the same struct_ops skeleton should succeed thanks to libbpf * creating BPF token from custom mount point */ skel = dummy_st_ops_success__open_and_load(); diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index f160d01ba5da..844f6fc8487b 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -213,7 +213,7 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child /* * There are 2 entry and 2 exit probe called for each uprobe_multi_func_[123] - * function and each slepable probe (6) increments uprobe_multi_sleep_result. + * function and each sleepable probe (6) increments uprobe_multi_sleep_result. */ ASSERT_EQ(skel->bss->uprobe_multi_func_1_result, 2, "uprobe_multi_func_1_result"); ASSERT_EQ(skel->bss->uprobe_multi_func_2_result, 2, "uprobe_multi_func_2_result"); diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index dfff6feac12c..d424e7ecbd12 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -643,7 +643,7 @@ static void test_user_ringbuf_blocking_reserve(void) if (!ASSERT_EQ(err, 0, "deferred_kick_thread\n")) goto cleanup; - /* After spawning another thread that asychronously kicks the kernel to + /* After spawning another thread that asynchronously kicks the kernel to * drain the messages, we're able to block and successfully get a * sample once we receive an event notification. 
*/ diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index d665b8a15cc4..f089faa97ae6 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -/* WARNING: This implemenation is not necessarily the same +/* WARNING: This implementation is not necessarily the same * as the tcp_cubic.c. The purpose is mainly for testing * the kernel BPF logic. * @@ -314,7 +314,7 @@ static void bictcp_update(struct bpf_bictcp *ca, __u32 cwnd, __u32 acked) * (so time^3 is done by using 64 bit) * and without the support of division of 64bit numbers * (so all divisions are done by using 32 bit) - * also NOTE the unit of those veriables + * also NOTE the unit of those variables * time = (t - K) / 2^bictcp_HZ * c = bic_scale >> 10 * rtt = (srtt >> 3) / HZ @@ -507,7 +507,7 @@ void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) __u32 delay; bpf_cubic_acked_called = 1; - /* Some calls are for duplicates without timetamps */ + /* Some calls are for duplicates without timestamps */ if (sample->rtt_us < 0) return; diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index f74459eead26..a5c74d31a244 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -373,7 +373,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr); /* * if bpf_probe_read_user_str returns error (<0), due to casting to - * unsinged int, it will become big number, so next check is + * unsigned int, it will become big number, so next check is * sufficient to check for errors AND prove to BPF verifier, that * bpf_probe_read_user_str won't return anything bigger than * STROBE_MAX_STR_LEN @@ -557,7 +557,7 @@ static void *read_strobe_meta(struct task_struct *task, return NULL; payload_off = ctx.payload_off; - /* this should not really happen, here only to satisfy verifer */ + /* this should not really happen, here only to satisfy verifier */ if (payload_off > sizeof(data->payload)) payload_off = sizeof(data->payload); #else diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c index da54c09e9a15..464515b824b9 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c @@ -503,7 +503,7 @@ static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_header * * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) * - * clang will substitue a costant for sizeof, which allows the verifier + * clang will substitute a constant for sizeof, which allows the verifier * to track it's value. Based on this, it can figure out the constant * return value, and calling code works while still being "generic" to * IPv4 and IPv6. 
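/* A sketch of the pattern described in the comment above (hypothetical
 * helper, not part of this patch): every call site passes sizeof(...),
 * so clang emits a compile-time constant for len and the verifier can
 * track its exact value and bound the copy.
 */
static __always_inline int fill_hdr(void *dst, const void *src, __u64 len)
{
	if (len > sizeof(struct ipv6hdr))
		return -1;
	__builtin_memcpy(dst, src, len);
	return 0;
}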
diff --git a/tools/testing/selftests/bpf/progs/test_core_read_macros.c b/tools/testing/selftests/bpf/progs/test_core_read_macros.c index fd54caa17319..873d85a4739b 100644 --- a/tools/testing/selftests/bpf/progs/test_core_read_macros.c +++ b/tools/testing/selftests/bpf/progs/test_core_read_macros.c @@ -36,7 +36,7 @@ int handler(void *ctx) return 0; /* next pointers for kernel address space have to be initialized from - * BPF side, user-space mmaped addresses are stil user-space addresses + * BPF side, user-space mmaped addresses are still user-space addresses */ k_probe_in.next = &k_probe_in; __builtin_preserve_access_index(({k_core_in.next = &k_core_in;})); diff --git a/tools/testing/selftests/bpf/progs/test_global_func15.c b/tools/testing/selftests/bpf/progs/test_global_func15.c index b4e089d6981d..201cc000b3f4 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func15.c +++ b/tools/testing/selftests/bpf/progs/test_global_func15.c @@ -44,7 +44,7 @@ __naked int global_func15_tricky_pruning(void) * case we have a valid 1 stored in R0 register, but in * a branch case we assign some random value to R0. So if * there is something wrong with precision tracking for R0 at - * program exit, we might erronenously prune branch case, + * program exit, we might erroneously prune branch case, * because R0 in fallthrough case is imprecise (and thus any * value is valid from POV of verifier is_state_equal() logic) */ diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c index 714b29c7f8b2..a3f220ba7025 100644 --- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c +++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c @@ -16,7 +16,7 @@ const volatile size_t data_array_len; int sum = 0; int array[1]; -/* custom data secton */ +/* custom data section */ int my_array[1] SEC(".data.custom"); /* custom data section which should NOT be resizable, diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index dfbab214f4d1..905d5981ace1 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1515,7 +1515,7 @@ again: value == key); } - /* Now let's delete all elemenets in parallel. */ + /* Now let's delete all elements in parallel. */ data[1] = DO_DELETE; run_parallel(TASKS, test_update_delete, data); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 83f390a31681..c7a70e1a1085 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1500,7 +1500,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) /* * We only print error logs summary when there are failed tests and - * verbose mode is not enabled. Otherwise, results may be incosistent. + * verbose mode is not enabled. Otherwise, results may be inconsistent. 
* */ if (!verbose() && fail_cnt) { diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c index d25c3e9605f1..f420c0312aa0 100644 --- a/tools/testing/selftests/bpf/verifier/map_kptr.c +++ b/tools/testing/selftests/bpf/verifier/map_kptr.c @@ -153,7 +153,7 @@ .result = REJECT, .errstr = "variable untrusted_ptr_ access var_off=(0x0; 0x7) disallowed", }, -/* Tests for unreferened PTR_TO_BTF_ID */ +/* Tests for unreferenced PTR_TO_BTF_ID */ { "map_kptr: unref: reject btf_struct_ids_match == false", .insns = { -- cgit v1.2.3 From a86857d2546c98f6c4e5677af8c6b8a80edd0704 Mon Sep 17 00:00:00 2001 From: Lin Yikai Date: Thu, 5 Sep 2024 19:03:06 +0800 Subject: bpftool: fix some typos in bpftool Hi, fix some spelling errors in bpftool, the details are as follows: -in file "bpftool-gen.rst" libppf->libbpf -in the code comments: ouptut->output Signed-off-by: Lin Yikai Link: https://lore.kernel.org/r/20240905110354.3274546-2-yikai.lin@vivo.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 2 +- tools/bpf/bpftool/feature.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index c768e6d4ae09..1b426e58a7cd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -104,7 +104,7 @@ bpftool gen skeleton *FILE* - **example__load**. This function creates maps, loads and verifies BPF programs, initializes - global data maps. It corresponds to libppf's **bpf_object__load**\ () + global data maps. It corresponds to libbpf's **bpf_object__load**\ () API. - **example__open_and_load** combines **example__open** and diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index c754a428c8c6..4dbc4fcdf473 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -196,7 +196,7 @@ static void probe_unprivileged_disabled(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/kernel/unprivileged_bpf_disabled"); if (json_output) { @@ -225,7 +225,7 @@ static void probe_jit_enable(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/net/core/bpf_jit_enable"); if (json_output) { @@ -255,7 +255,7 @@ static void probe_jit_harden(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/net/core/bpf_jit_harden"); if (json_output) { @@ -285,7 +285,7 @@ static void probe_jit_kallsyms(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/net/core/bpf_jit_kallsyms"); if (json_output) { @@ -311,7 +311,7 @@ static void probe_jit_limit(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/net/core/bpf_jit_limit"); if (json_output) { -- cgit v1.2.3 From bd4d67f8ae55d42273c6cd661b622ed713d20350 Mon Sep 17 00:00:00 2001 From: Lin Yikai Date: Thu, 5 Sep 2024 19:03:07 +0800 Subject: libbpf: fix some typos in libbpf Hi, fix some spelling errors in libbpf, the details are as follows: -in the code comments: termintaing->terminating architecutre->architecture requring->requiring recored->recoded sanitise->sanities allowd->allowed abover->above see bpf_udst_arg()->see 
bpf_usdt_arg() Signed-off-by: Lin Yikai Link: https://lore.kernel.org/r/20240905110354.3274546-3-yikai.lin@vivo.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.h | 4 ++-- tools/lib/bpf/bpf_tracing.h | 4 ++-- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/linker.c | 4 ++-- tools/lib/bpf/usdt.bpf.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 972e17ec0c09..a4a7b1ad1b63 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -100,7 +100,7 @@ struct bpf_prog_load_opts { __u32 log_level; __u32 log_size; char *log_buf; - /* output: actual total log contents size (including termintaing zero). + /* output: actual total log contents size (including terminating zero). * It could be both larger than original log_size (if log was * truncated), or smaller (if log buffer wasn't filled completely). * If kernel doesn't support this feature, log_size is left unchanged. @@ -129,7 +129,7 @@ struct bpf_btf_load_opts { char *log_buf; __u32 log_level; __u32 log_size; - /* output: actual total log contents size (including termintaing zero). + /* output: actual total log contents size (including terminating zero). * It could be both larger than original log_size (if log was * truncated), or smaller (if log buffer wasn't filled completely). * If kernel doesn't support this feature, log_size is left unchanged. diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index bad7a0dbf510..4eab132a963e 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -480,7 +480,7 @@ struct pt_regs; #endif /* * Similarly, syscall-specific conventions might differ between function call - * conventions within each architecutre. All supported architectures pass + * conventions within each architecture. All supported architectures pass * either 6 or 7 syscall arguments in registers. * * See syscall(2) manpage for succinct table with information on each arch. @@ -658,7 +658,7 @@ struct pt_regs; * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and * similar kinds of BPF programs, that accept input arguments as a single * pointer to untyped u64 array, where each u64 can actually be a typed - * pointer or integer of different size. Instead of requring user to write + * pointer or integer of different size. Instead of requiring user to write * manual casts and work with array elements by index, BPF_PROG macro * allows user to declare a list of named and typed input arguments in the * same syntax as for normal C function. All the casting is hidden and diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 40aae244e35f..8d51e73d55a8 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4192,7 +4192,7 @@ static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id * and canonical graphs are not compatible structurally, whole graphs are * incompatible. If types are structurally equivalent (i.e., all information * except referenced type IDs is exactly the same), a mapping from `canon_id` to - * a `cand_id` is recored in hypothetical mapping (`btf_dedup->hypot_map`). + * a `cand_id` is recoded in hypothetical mapping (`btf_dedup->hypot_map`). * If a type references other types, then those referenced types are checked * for equivalence recursively. 
* diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d3a542649e6b..27ad3c6ee868 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1848,7 +1848,7 @@ static char *internal_map_name(struct bpf_object *obj, const char *real_name) snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name, sfx_len, real_name); - /* sanitise map name to characters allowed by kernel */ + /* sanities map name to characters allowed by kernel */ for (p = map_name; *p && p < map_name + sizeof(map_name); p++) if (!isalnum(*p) && *p != '_' && *p != '.') *p = '_'; diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 9cd3d4109788..e0005c6ade88 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1413,7 +1413,7 @@ recur: return true; case BTF_KIND_PTR: /* just validate overall shape of the referenced type, so no - * contents comparison for struct/union, and allowd fwd vs + * contents comparison for struct/union, and allowed fwd vs * struct/union */ exact = false; @@ -1962,7 +1962,7 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, /* If existing symbol is a strong resolved symbol, bail out, * because we lost resolution battle have nothing to - * contribute. We already checked abover that there is no + * contribute. We already checked above that there is no * strong-strong conflict. We also already tightened binding * and visibility, so nothing else to contribute at that point. */ diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index 76359bcdc94a..b811f754939f 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -39,7 +39,7 @@ enum __bpf_usdt_arg_type { struct __bpf_usdt_arg_spec { /* u64 scalar interpreted depending on arg_type, see below */ __u64 val_off; - /* arg location case, see bpf_udst_arg() for details */ + /* arg location case, see bpf_usdt_arg() for details */ enum __bpf_usdt_arg_type arg_type; /* offset of referenced register within struct pt_regs */ short reg_off; -- cgit v1.2.3 From dc3a8804d790cc7dda442e5504a0c67435b356f4 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:52 +0000 Subject: selftests/bpf: Adapt OUTPUT appending logic to lower versions of Make The $(let ...) function is only supported by GNU Make version 4.4 [0] and above, otherwise the following unexpected file and directory will be generated: tools/testing/selftests/bpfFEATURE-DUMP.selftests tools/testing/selftests/bpffeature/ Considering that the GNU Make version of most Linux distributions is lower than 4.4, let us adapt the corresponding logic accordingly.
Link: https://lists.gnu.org/archive/html/info-gnu/2022-10/msg00008.html [0] Acked-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-2-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7660d19b66c2..9905e3739dd0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -187,8 +187,14 @@ FEATURE_TESTS := llvm FEATURE_DISPLAY := $(FEATURE_TESTS) # Makefile.feature expects OUTPUT to end with a slash +ifeq ($(shell expr $(MAKE_VERSION) \>= 4.4), 1) $(let OUTPUT,$(OUTPUT)/,\ $(eval include ../../../build/Makefile.feature)) +else +OUTPUT := $(OUTPUT)/ +$(eval include ../../../build/Makefile.feature) +OUTPUT := $(patsubst %/,%,$(OUTPUT)) +endif endif ifeq ($(feature-llvm),1) -- cgit v1.2.3 From a48a43884cdd46dba8cc4a15160c116ffd965800 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:53 +0000 Subject: selftests/bpf: Rename fallback in bpf_dctcp to avoid naming conflict Recently, when compiling bpf selftests on RV64, the following compilation failure occurred: progs/bpf_dctcp.c:29:21: error: redefinition of 'fallback' as different kind of symbol 29 | volatile const char fallback[TCP_CA_NAME_MAX]; | ^ /workspace/tools/testing/selftests/bpf/tools/include/vmlinux.h:86812:15: note: previous definition is here 86812 | typedef u32 (*fallback)(u32, const unsigned char *, size_t); The reason is that the `fallback` symbol has been defined in arch/riscv/lib/crc32.c, which will cause symbol conflicts when vmlinux.h is included in bpf_dctcp. Let us rename the `fallback` symbol to `fallback_cc` in bpf_dctcp to fix this compilation failure.
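Spelled out, the clash reconstructed from the error above is simply the same identifier declared as two different kinds of symbol:

/* pulled in via vmlinux.h on RV64: 'fallback' is a function-pointer typedef */
typedef u32 (*fallback)(u32, const unsigned char *, size_t);

/* bpf_dctcp.c before this patch: 'fallback' reused as a global array */
volatile const char fallback[TCP_CA_NAME_MAX];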
Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-3-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 1d494b4453f4..409a06975823 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -285,7 +285,7 @@ static void test_dctcp_fallback(void) dctcp_skel = bpf_dctcp__open(); if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) return; - strcpy(dctcp_skel->rodata->fallback, "cubic"); + strcpy(dctcp_skel->rodata->fallback_cc, "cubic"); if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load")) goto done; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 02f552e7fd4d..7cd73e75f52a 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -26,7 +26,7 @@ static bool before(__u32 seq1, __u32 seq2) char _license[] SEC("license") = "GPL"; -volatile const char fallback[TCP_CA_NAME_MAX]; +volatile const char fallback_cc[TCP_CA_NAME_MAX]; const char bpf_dctcp[] = "bpf_dctcp"; const char tcp_cdg[] = "cdg"; char cc_res[TCP_CA_NAME_MAX]; @@ -71,10 +71,10 @@ void BPF_PROG(bpf_dctcp_init, struct sock *sk) struct bpf_dctcp *ca = inet_csk_ca(sk); int *stg; - if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { + if (!(tp->ecn_flags & TCP_ECN_OK) && fallback_cc[0]) { /* Switch to fallback */ if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, - (void *)fallback, sizeof(fallback)) == -EBUSY) + (void *)fallback_cc, sizeof(fallback_cc)) == -EBUSY) ebusy_cnt++; /* Switch back to myself and the recurred bpf_dctcp_init() @@ -87,7 +87,7 @@ void BPF_PROG(bpf_dctcp_init, struct sock *sk) /* Switch back to fallback */ if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, - (void *)fallback, sizeof(fallback)) == -EBUSY) + (void *)fallback_cc, sizeof(fallback_cc)) == -EBUSY) ebusy_cnt++; /* Expecting -ENOTSUPP for tcp_cdg_res */ -- cgit v1.2.3 From 67ab80a01886a178c69c91728c49f94ccb71f338 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 5 Sep 2024 08:13:54 +0000 Subject: selftests/bpf: Prefer static linking for LLVM libraries It is not always convenient to have LLVM libraries installed inside CI rootfs images, thus request static libraries from llvm-config. 
Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Acked-by: Daniel Borkmann Tested-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240905081401.1894789-4-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9905e3739dd0..04716a5e43f1 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -202,11 +202,9 @@ ifeq ($(feature-llvm),1) LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) - LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) - ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) - LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) - LLVM_LDLIBS += -lstdc++ - endif + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LLVM_LDLIBS += -lstdc++ LLVM_LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags) endif -- cgit v1.2.3 From 0c3fc330be6d85febd0f68b66c657c752e9cd63c Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:55 +0000 Subject: selftests/bpf: Limit URLS parsing logic to actual scope in vmtest The URLS array is only valid in the download_rootfs function and does not need to be parsed globally in advance. At the same time, the logic of loading rootfs is refactored to prepare vmtest for supporting local rootfs. Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-5-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/vmtest.sh | 39 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 65d14f3bbe30..87d93f29c565 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -92,19 +92,6 @@ populate_url_map() fi } -download() -{ - local file="$1" - - if [[ ! -v URLS[$file] ]]; then - echo "$file not found" >&2 - return 1 - fi - - echo "Downloading $file..." >&2 - curl -Lsf "${URLS[$file]}" "${@:2}" -} - newest_rootfs_version() { { @@ -118,16 +105,30 @@ newest_rootfs_version() download_rootfs() { - local rootfsversion="$1" - local dir="$2" + populate_url_map + + local rootfsversion="$(newest_rootfs_version)" + local file="${ARCH}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" + + if [[ ! -v URLS[$file] ]]; then + echo "$file not found" >&2 + return 1 + fi + + echo "Downloading $file..." >&2 + curl -Lsf "${URLS[$file]}" "${@:2}" +} + +load_rootfs() +{ + local dir="$1" if ! 
which zstd &> /dev/null; then echo 'Could not find "zstd" on the system, please install zstd' exit 1 fi - download "${ARCH}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" | - zstd -d | sudo tar -C "$dir" -x + download_rootfs | zstd -d | sudo tar -C "$dir" -x } recompile_kernel() @@ -227,7 +228,7 @@ create_vm_image() mkfs.ext4 -q "${rootfs_img}" mount_image - download_rootfs "$(newest_rootfs_version)" "${mount_dir}" + load_rootfs "${mount_dir}" unmount_image } @@ -402,8 +403,6 @@ main() make_command="${make_command} KBUILD_OUTPUT=${KBUILD_OUTPUT}" fi - populate_url_map - local rootfs_img="${OUTPUT_DIR}/${ROOTFS_IMAGE}" local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}" -- cgit v1.2.3 From 2294073dce32af8535e9d4a32e1efdeea35c1786 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:56 +0000 Subject: selftests/bpf: Support local rootfs image for vmtest Support vmtest to use local rootfs image generated by [0] that is consistent with BPF CI. Now we can specify the local rootfs image through the `-l` parameter like as follows: vmtest.sh -l ./libbpf-vmtest-rootfs-2024.08.22-noble-amd64.tar.zst -- ./test_progs Meanwhile, some descriptions have been flushed. Link: https://github.com/libbpf/ci/blob/main/rootfs/mkrootfs_debian.sh [0] Acked-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-6-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/README.rst | 2 -- tools/testing/selftests/bpf/vmtest.sh | 21 ++++++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 9b974e425af3..4a1e74b17109 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -85,8 +85,6 @@ In case of linker errors when running selftests, try using static linking: If you want to change pahole and llvm, you can change `PATH` environment variable in the beginning of script. -.. note:: The script currently only supports x86_64 and s390x architectures. - Additional information about selftest failures are documented here. diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 87d93f29c565..7bd2b44deb08 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -4,9 +4,11 @@ set -u set -e -# This script currently only works for x86_64 and s390x, as -# it is based on the VM image used by the BPF CI, which is -# available only for these architectures. +# This script currently only works for the following platforms, +# as it is based on the VM image used by the BPF CI, which is +# available only for these architectures. We can also specify +# the local rootfs image generated by the following script: +# https://github.com/libbpf/ci/blob/main/rootfs/mkrootfs_debian.sh ARCH="$(uname -m)" case "${ARCH}" in s390x) @@ -34,6 +36,7 @@ aarch64) esac DEFAULT_COMMAND="./test_progs" MOUNT_DIR="mnt" +LOCAL_ROOTFS_IMAGE="" ROOTFS_IMAGE="root.img" OUTPUT_DIR="$HOME/.bpf_selftests" KCONFIG_REL_PATHS=("tools/testing/selftests/bpf/config" @@ -69,6 +72,7 @@ or Options: + -l) Specify the path to the local rootfs image. -i) Update the rootfs image with a newer version. 
-d) Update the output directory (default: ${OUTPUT_DIR}) -j) Number of jobs for compilation, similar to -j in make @@ -128,7 +132,11 @@ load_rootfs() exit 1 fi - download_rootfs | zstd -d | sudo tar -C "$dir" -x + if [[ -n "${LOCAL_ROOTFS_IMAGE}" ]]; then + cat "${LOCAL_ROOTFS_IMAGE}" | zstd -d | sudo tar -C "$dir" -x + else + download_rootfs | zstd -d | sudo tar -C "$dir" -x + fi } recompile_kernel() @@ -342,8 +350,11 @@ main() local exit_command="poweroff -f" local debug_shell="no" - while getopts ':hskid:j:' opt; do + while getopts ':hskl:id:j:' opt; do case ${opt} in + l) + LOCAL_ROOTFS_IMAGE="$OPTARG" + ;; i) update_image="yes" ;; -- cgit v1.2.3 From d95d56519026322b25ffa964bb92182620176972 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:57 +0000 Subject: selftests/bpf: Enable cross platform testing for vmtest Add support cross platform testing for vmtest. The variable $ARCH in the current script is platform semantics, not kernel semantics. Rename it to $PLATFORM so that we can easily use $ARCH in cross-compilation. And drop `set -u` unbound variable check as we will use CROSS_COMPILE env variable. For now, Using PLATFORM= and CROSS_COMPILE= options will enable cross platform testing: PLATFORM= CROSS_COMPILE= vmtest.sh -- ./test_progs Tested-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-7-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/vmtest.sh | 42 ++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 7bd2b44deb08..91f940e8ce45 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -1,7 +1,6 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -set -u set -e # This script currently only works for the following platforms, @@ -9,25 +8,31 @@ set -e # available only for these architectures. We can also specify # the local rootfs image generated by the following script: # https://github.com/libbpf/ci/blob/main/rootfs/mkrootfs_debian.sh -ARCH="$(uname -m)" -case "${ARCH}" in +PLATFORM="${PLATFORM:-$(uname -m)}" +case "${PLATFORM}" in s390x) QEMU_BINARY=qemu-system-s390x QEMU_CONSOLE="ttyS1" - QEMU_FLAGS=(-smp 2) + HOST_FLAGS=(-smp 2 -enable-kvm) + CROSS_FLAGS=(-smp 2) BZIMAGE="arch/s390/boot/vmlinux" + ARCH="s390" ;; x86_64) QEMU_BINARY=qemu-system-x86_64 QEMU_CONSOLE="ttyS0,115200" - QEMU_FLAGS=(-cpu host -smp 8) + HOST_FLAGS=(-cpu host -enable-kvm -smp 8) + CROSS_FLAGS=(-smp 8) BZIMAGE="arch/x86/boot/bzImage" + ARCH="x86" ;; aarch64) QEMU_BINARY=qemu-system-aarch64 QEMU_CONSOLE="ttyAMA0,115200" - QEMU_FLAGS=(-M virt,gic-version=3 -cpu host -smp 8) + HOST_FLAGS=(-M virt,gic-version=3 -cpu host -enable-kvm -smp 8) + CROSS_FLAGS=(-M virt,gic-version=3 -cpu cortex-a76 -smp 8) BZIMAGE="arch/arm64/boot/Image" + ARCH="arm64" ;; *) echo "Unsupported architecture" @@ -41,7 +46,7 @@ ROOTFS_IMAGE="root.img" OUTPUT_DIR="$HOME/.bpf_selftests" KCONFIG_REL_PATHS=("tools/testing/selftests/bpf/config" "tools/testing/selftests/bpf/config.vm" - "tools/testing/selftests/bpf/config.${ARCH}") + "tools/testing/selftests/bpf/config.${PLATFORM}") INDEX_URL="https://raw.githubusercontent.com/libbpf/ci/master/INDEX" NUM_COMPILE_JOBS="$(nproc)" LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")" @@ -61,6 +66,10 @@ tools/testing/selftests/bpf. 
e.g: If no command is specified and a debug shell (-s) is not requested, "${DEFAULT_COMMAND}" will be run by default. +Using PLATFORM= and CROSS_COMPILE= options will enable cross platform testing: + + PLATFORM= CROSS_COMPILE= $0 -- ./test_progs -t test_lsm + If you build your kernel using KBUILD_OUTPUT= or O= options, these can be passed as environment variables to the script: @@ -100,7 +109,7 @@ newest_rootfs_version() { { for file in "${!URLS[@]}"; do - if [[ $file =~ ^"${ARCH}"/libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then + if [[ $file =~ ^"${PLATFORM}"/libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then echo "${BASH_REMATCH[1]}" fi done @@ -112,7 +121,7 @@ download_rootfs() populate_url_map local rootfsversion="$(newest_rootfs_version)" - local file="${ARCH}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" + local file="${PLATFORM}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" if [[ ! -v URLS[$file] ]]; then echo "$file not found" >&2 @@ -253,12 +262,17 @@ EOF exit 1 fi + if [[ "${PLATFORM}" != "$(uname -m)" ]]; then + QEMU_FLAGS=("${CROSS_FLAGS[@]}") + else + QEMU_FLAGS=("${HOST_FLAGS[@]}") + fi + ${QEMU_BINARY} \ -nodefaults \ -display none \ -serial mon:stdio \ "${QEMU_FLAGS[@]}" \ - -enable-kvm \ -m 4G \ -drive file="${rootfs_img}",format=raw,index=1,media=disk,if=virtio,cache=none \ -kernel "${kernel_bzimage}" \ @@ -389,6 +403,11 @@ main() trap 'catch "$?"' EXIT + if [[ "${PLATFORM}" != "$(uname -m)" ]] && [[ -z "${CROSS_COMPILE}" ]]; then + echo "Cross-platform testing needs to specify CROSS_COMPILE" + exit 1 + fi + if [[ $# -eq 0 && "${debug_shell}" == "no" ]]; then echo "No command specified, will run ${DEFAULT_COMMAND} in the vm" else @@ -396,7 +415,8 @@ main() fi local kconfig_file="${OUTPUT_DIR}/latest.config" - local make_command="make -j ${NUM_COMPILE_JOBS} KCONFIG_CONFIG=${kconfig_file}" + local make_command="make ARCH=${ARCH} CROSS_COMPILE=${CROSS_COMPILE} \ + -j ${NUM_COMPILE_JOBS} KCONFIG_CONFIG=${kconfig_file}" # Figure out where the kernel is being built. # O takes precedence over KBUILD_OUTPUT. -- cgit v1.2.3 From 897b3680484b8163c9eb51ed55103504ffd4a7d1 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:58 +0000 Subject: selftests/bpf: Add config.riscv64 Add config.riscv64 for both BPF CI and local vmtest. 
Tested-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-8-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config.riscv64 | 84 ++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tools/testing/selftests/bpf/config.riscv64 (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config.riscv64 b/tools/testing/selftests/bpf/config.riscv64 new file mode 100644 index 000000000000..bb7043a80e1a --- /dev/null +++ b/tools/testing/selftests/bpf/config.riscv64 @@ -0,0 +1,84 @@ +CONFIG_AUDIT=y +CONFIG_BLK_CGROUP=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BONDING=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_PRELOAD=y +CONFIG_BPF_PRELOAD_UMD=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CPUSETS=y +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_DEBUG_FS=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_EXPERT=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_FRAME_POINTER=y +CONFIG_HARDLOCKUP_DETECTOR=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_HUGETLBFS=y +CONFIG_INET=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_JUMP_LABEL=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KPROBES=y +CONFIG_MEMCG=y +CONFIG_NAMESPACES=y +CONFIG_NET=y +CONFIG_NETDEVICES=y +CONFIG_NETFILTER_XT_MATCH_BPF=y +CONFIG_NET_ACT_BPF=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_VRF=y +CONFIG_NONPORTABLE=y +CONFIG_NO_HZ_IDLE=y +CONFIG_NR_CPUS=256 +CONFIG_PACKET=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_PCI=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_PRINTK_TIME=y +CONFIG_PROC_KCORE=y +CONFIG_PROFILING=y +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_RISCV_ISA_C=y +CONFIG_RISCV_PMU=y +CONFIG_RISCV_PMU_SBI=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SMP=y +CONFIG_SOC_VIRT=y +CONFIG_SYSVIPC=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TLS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TUN=y +CONFIG_UNIX=y +CONFIG_UPROBES=y +CONFIG_USER_NS=y +CONFIG_VETH=y +CONFIG_VLAN_8021Q=y +CONFIG_VSOCKETS_LOOPBACK=y +CONFIG_XFRM_USER=y -- cgit v1.2.3 From c402cb85802fa059f1d42954f25acf14c3bf89e7 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:59 +0000 Subject: selftests/bpf: Add DENYLIST.riscv64 This patch adds DENYLIST.riscv64 file for riscv64. It will help BPF CI and local vmtest to mask failing and unsupported test cases. We can use the following command to use deny list in local vmtest as previously mentioned by Manu. 
PLATFORM=riscv64 CROSS_COMPILE=riscv64-linux-gnu- vmtest.sh \ -l ./libbpf-vmtest-rootfs-2024.08.30-noble-riscv64.tar.zst -- \ ./test_progs -d \ \"$(cat tools/testing/selftests/bpf/DENYLIST.riscv64 \ | cut -d'#' -f1 \ | sed -e 's/^[[:space:]]*//' \ -e 's/[[:space:]]*$//' \ | tr -s '\n' ','\ )\" Tested-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-9-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.riscv64 | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/bpf/DENYLIST.riscv64 (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.riscv64 b/tools/testing/selftests/bpf/DENYLIST.riscv64 new file mode 100644 index 000000000000..4fc4dfdde293 --- /dev/null +++ b/tools/testing/selftests/bpf/DENYLIST.riscv64 @@ -0,0 +1,3 @@ +# riscv64 deny list for BPF CI and local vmtest +exceptions # JIT does not support exceptions +tailcalls/tailcall_bpf2bpf* # JIT does not support mixing bpf2bpf and tailcalls -- cgit v1.2.3 From b2bc9d50549939e3e103f986f3123970a6b81e44 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:14:00 +0000 Subject: selftests/bpf: Add riscv64 configurations to local vmtest Add riscv64 configurations to local vmtest. We can now perform cross platform testing for riscv64 bpf using the following command: PLATFORM=riscv64 CROSS_COMPILE=riscv64-linux-gnu- vmtest.sh \ -l ./libbpf-vmtest-rootfs-2024.08.30-noble-riscv64.tar.zst -- \ ./test_progs -d \ \"$(cat tools/testing/selftests/bpf/DENYLIST.riscv64 \ | cut -d'#' -f1 \ | sed -e 's/^[[:space:]]*//' \ -e 's/[[:space:]]*$//' \ | tr -s '\n' ','\ )\" Tested-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-10-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/vmtest.sh | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 91f940e8ce45..79505d294c44 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -34,6 +34,15 @@ aarch64) BZIMAGE="arch/arm64/boot/Image" ARCH="arm64" ;; +riscv64) + # required qemu version v7.2.0+ + QEMU_BINARY=qemu-system-riscv64 + QEMU_CONSOLE="ttyS0,115200" + HOST_FLAGS=(-M virt -cpu host -enable-kvm -smp 8) + CROSS_FLAGS=(-M virt -cpu rv64,sscofpmf=true -smp 8) + BZIMAGE="arch/riscv/boot/Image" + ARCH="riscv" + ;; *) echo "Unsupported architecture" exit 1 -- cgit v1.2.3 From 95b1c5d17832f61d0438cd5060a997bafaf75f2c Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:14:01 +0000 Subject: selftests/bpf: Add description for running vmtest on RV64 Add description in tools/testing/selftests/bpf/README.rst for running vmtest on RV64. 
Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-11-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/README.rst | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 4a1e74b17109..776fbe3cb8f9 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -85,6 +85,38 @@ In case of linker errors when running selftests, try using static linking: If you want to change pahole and llvm, you can change `PATH` environment variable in the beginning of script. +Running vmtest on RV64 +====================== +To speed up testing and avoid various dependency issues, it is recommended to +run vmtest in a Docker container. Before running vmtest, we need to prepare +Docker container and local rootfs image. The overall steps are as follows: + +1. Create Docker container as shown in link [0]. + +2. Use mkrootfs_debian.sh script [1] to build local rootfs image: + +.. code-block:: console + + $ sudo ./mkrootfs_debian.sh --arch riscv64 --distro noble + +3. Start Docker container [0] and run vmtest in the container: + +.. code-block:: console + + $ PLATFORM=riscv64 CROSS_COMPILE=riscv64-linux-gnu- \ + tools/testing/selftests/bpf/vmtest.sh \ + -l -- \ + ./test_progs -d \ + \"$(cat tools/testing/selftests/bpf/DENYLIST.riscv64 \ + | cut -d'#' -f1 \ + | sed -e 's/^[[:space:]]*//' \ + -e 's/[[:space:]]*$//' \ + | tr -s '\n' ',' \ + )\" + +Link: https://github.com/pulehui/riscv-bpf-vmtest.git [0] +Link: https://github.com/libbpf/ci/blob/main/rootfs/mkrootfs_debian.sh [1] + Additional information about selftest failures are documented here. -- cgit v1.2.3 From 1b3bc648f5067d32959d7389a7c17c8c04e8e689 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 5 Sep 2024 15:38:12 -0700 Subject: bpf/selftests: coverage for tp and perf event progs using kfuncs This coverage ensures that kfuncs are allowed within tracepoint and perf event programs. 
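These verifier_*.c programs carry __success annotations, so loading them is the assertion itself; the test_loader framework checks each program against its annotation. A sketch of the runner side, assuming the usual RUN_TESTS wiring in prog_tests/verifier.c (not part of this diff):

#include "verifier_kfunc_prog_types.skel.h"

void test_verifier_kfunc_prog_types(void)
{
	/* load every program in the skeleton and check each one
	 * against its __success/__failure annotation
	 */
	RUN_TESTS(verifier_kfunc_prog_types);
}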
Signed-off-by: JP Kobryn Link: https://lore.kernel.org/r/20240905223812.141857-3-inwardvessel@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_kfunc_prog_types.c | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c index cb32b0cfc84b..a509cad97e69 100644 --- a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -47,6 +47,22 @@ int BPF_PROG(task_kfunc_syscall) return 0; } +SEC("tracepoint") +__success +int BPF_PROG(task_kfunc_tracepoint) +{ + task_kfunc_load_test(); + return 0; +} + +SEC("perf_event") +__success +int BPF_PROG(task_kfunc_perf_event) +{ + task_kfunc_load_test(); + return 0; +} + /***************** * cgroup kfuncs * *****************/ @@ -85,6 +101,22 @@ int BPF_PROG(cgrp_kfunc_syscall) return 0; } +SEC("tracepoint") +__success +int BPF_PROG(cgrp_kfunc_tracepoint) +{ + cgrp_kfunc_load_test(); + return 0; +} + +SEC("perf_event") +__success +int BPF_PROG(cgrp_kfunc_perf_event) +{ + cgrp_kfunc_load_test(); + return 0; +} + /****************** * cpumask kfuncs * ******************/ @@ -120,3 +152,19 @@ int BPF_PROG(cpumask_kfunc_syscall) cpumask_kfunc_load_test(); return 0; } + +SEC("tracepoint") +__success +int BPF_PROG(cpumask_kfunc_tracepoint) +{ + cpumask_kfunc_load_test(); + return 0; +} + +SEC("perf_event") +__success +int BPF_PROG(cpumask_kfunc_perf_event) +{ + cpumask_kfunc_load_test(); + return 0; +} -- cgit v1.2.3 From 4b80294fb53845dc5c98cca0c989da09150f2ca9 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Wed, 4 Sep 2024 22:19:08 -0400 Subject: pm:cpupower: Add missing powercap_set_enabled() stub function There was a symbol listed in the powercap.h file that was not implemented. Implement it with a stub return of 0. Programs like SWIG require that functions that are defined in the headers be implemented. Fixes: c2294c1496b7 ("cpupower: Introduce powercap intel-rapl library and powercap-info command") Suggested-by: Shuah Khan Signed-off-by: John B. Wyatt IV Signed-off-by: John B. Wyatt IV Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/powercap.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/lib/powercap.c b/tools/power/cpupower/lib/powercap.c index a7a59c6bacda..94a0c69e55ef 100644 --- a/tools/power/cpupower/lib/powercap.c +++ b/tools/power/cpupower/lib/powercap.c @@ -77,6 +77,14 @@ int powercap_get_enabled(int *mode) return sysfs_get_enabled(path, mode); } +/* + * TODO: implement function. Returns dummy 0 for now. + */ +int powercap_set_enabled(int mode) +{ + return 0; +} + /* * Hardcoded, because rapl is the only powercap implementation - * this needs to get more generic if more powercap implementations -- cgit v1.2.3 From 338f490e07bc9e83c6fe60125ed9dea3e5d6a45d Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Wed, 4 Sep 2024 22:19:09 -0400 Subject: pm:cpupower: Add SWIG bindings files for libcpupower SWIG is a tool packaged in Fedora and other distros that can generate bindings from C and C++ code for several languages including Python, Perl, and Go. These bindings allows users to easily write scripts that use and extend libcpupower's functionality. Currently, only Python is provided in the makefile, but additional languages may be added if there is demand. 
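The generated Python bindings are thin wrappers over the C declarations collected in raw_pylibcpupower.i below, so each Python call maps onto plain libcpupower usage like this (illustrative C only; the CPU index and error handling are placeholders):

#include <stdio.h>
#include "cpufreq.h"

int main(void)
{
	unsigned long min, max;

	/* current frequency of CPU 0 as reported by the kernel, in kHz */
	unsigned long freq = cpufreq_get_freq_kernel(0);

	if (cpufreq_get_hardware_limits(0, &min, &max) == 0)
		printf("cpu0: %lu kHz (hw range %lu-%lu)\n", freq, min, max);
	return 0;
}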
Added suggestions from Shuah Khan for the README and license discussion. Note that while SWIG itself is GPL v3+ licensed, the resulting output, the bindings code, is permissively licensed, plus the license of the .o files. Please see https://swig.org/legal.html and [1] for more details. [1] https://lore.kernel.org/linux-pm/Zqv9BOjxLAgyNP5B@hatbackup/ Suggested-by: Shuah Khan Signed-off-by: John B. Wyatt IV Signed-off-by: John B. Wyatt IV Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/.gitignore | 8 + tools/power/cpupower/bindings/python/Makefile | 31 +++ tools/power/cpupower/bindings/python/README | 59 +++++ .../cpupower/bindings/python/raw_pylibcpupower.i | 247 +++++++++++++++++++++ 4 files changed, 345 insertions(+) create mode 100644 tools/power/cpupower/bindings/python/.gitignore create mode 100644 tools/power/cpupower/bindings/python/Makefile create mode 100644 tools/power/cpupower/bindings/python/README create mode 100644 tools/power/cpupower/bindings/python/raw_pylibcpupower.i (limited to 'tools') diff --git a/tools/power/cpupower/bindings/python/.gitignore b/tools/power/cpupower/bindings/python/.gitignore new file mode 100644 index 000000000000..5c9a1f0212dd --- /dev/null +++ b/tools/power/cpupower/bindings/python/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +raw_pylibcpupower_wrap.c +*.o +*.so +*.py +!test_raw_pylibcpupower.py +# git keeps ignoring this file, use git add -f raw_libcpupower.i +!raw_pylibcpupower.i diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile new file mode 100644 index 000000000000..d0418f902795 --- /dev/null +++ b/tools/power/cpupower/bindings/python/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Makefile for libcpupower's Python bindings +# +# This Makefile expects you have already run the makefile for cpupower to build +# the .o files in the lib directory for the bindings to be created. + +CC=gcc + +LIB_DIR = ../../lib +BIND_DIR = . +PY_INCLUDE := $(firstword $(shell python-config --includes)) +#PY_INCLUDE = $(shell python-config --includes | awk '{ print $1 }') + +OBJECTS_LIB = $(wildcard $(LIB_DIR)/*.o) +OBJECTS_BIND = $(wildcard $(BIND_DIR)/*.o) + +all: _raw_pylibcpupower.so + +_raw_pylibcpupower.so: raw_pylibcpupower_wrap.o + $(CC) -shared $(OBJECTS_LIB) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so # raw_pylibcpupower_wrap.o +# $(CC) -shared $(OBJECTS_BIND) $(OBJECTS_LIB) -o _raw_pylibcpupower.so # raw_pylibcpupower_wrap.o + +raw_pylibcpupower_wrap.o: raw_pylibcpupower_wrap.c + $(CC) -fPIC -c raw_pylibcpupower_wrap.c $(PY_INCLUDE) + +raw_pylibcpupower_wrap.c: raw_pylibcpupower.i + swig -python raw_pylibcpupower.i + +# Will only clean the bindings folder; will not clean the actual cpupower folder +clean: + rm -f raw_pylibcpupower.py raw_pylibcpupower_wrap.c raw_pylibcpupower_wrap.o _raw_pylibcpupower.so diff --git a/tools/power/cpupower/bindings/python/README b/tools/power/cpupower/bindings/python/README new file mode 100644 index 000000000000..0a4bb2581e8a --- /dev/null +++ b/tools/power/cpupower/bindings/python/README @@ -0,0 +1,59 @@ +This folder contains the necessary files to build the Python bindings for +libcpupower (aside from the libcpupower object files). + + +requirements +------------ + +* You need the object files in the libcpupower directory compiled by +cpupower's makefile. +* The SWIG program must be installed. +* The Python development libraries must be installed.
+ +Please check that your version of SWIG is compatible with the version of Python +installed on your machine by checking the SWIG changelog on their website. +https://swig.org/ + +Note that while SWIG itself is GPL v3+ licensed; the resulting output, +the bindings code: is permissively licensed + the license of libcpupower's .o +files. For these bindings that means GPL v2. + +Please see https://swig.org/legal.html and the discussion [1] for more details. + +[1] +https://lore.kernel.org/linux-pm/Zqv9BOjxLAgyNP5B@hatbackup/ + + +build +----- + +Install SWIG and the Python development files provided by your distribution. + +Build the object files for libcpupower by running make in the cpupower +directory. + +Return to the directory this README is in to run: + +$ make + + +testing +------- + +Please verify the _raw_pylibcpupower.so and raw_pylibcpupower.py files have +been created. + +To run the test script: + +$ python test_raw_pylibcpupower.py + + +credits +------- + +Original Bindings Author: +John B. Wyatt IV +jwyatt@redhat.com +sageofredondo@gmail.com + +Copyright (C) 2024 Red Hat diff --git a/tools/power/cpupower/bindings/python/raw_pylibcpupower.i b/tools/power/cpupower/bindings/python/raw_pylibcpupower.i new file mode 100644 index 000000000000..96556d87a745 --- /dev/null +++ b/tools/power/cpupower/bindings/python/raw_pylibcpupower.i @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +%module raw_pylibcpupower +%{ +#include "../../lib/cpupower_intern.h" +#include "../../lib/acpi_cppc.h" +#include "../../lib/cpufreq.h" +#include "../../lib/cpuidle.h" +#include "../../lib/cpupower.h" +#include "../../lib/powercap.h" +%} + +/* + * cpupower_intern.h + */ + +#define PATH_TO_CPU "/sys/devices/system/cpu/" +#define MAX_LINE_LEN 4096 +#define SYSFS_PATH_MAX 255 + +int is_valid_path(const char *path); + +unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen); + +unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen); + +/* + * acpi_cppc.h + */ + +enum acpi_cppc_value { + HIGHEST_PERF, + LOWEST_PERF, + NOMINAL_PERF, + LOWEST_NONLINEAR_PERF, + LOWEST_FREQ, + NOMINAL_FREQ, + REFERENCE_PERF, + WRAPAROUND_TIME, + MAX_CPPC_VALUE_FILES +}; + +unsigned long acpi_cppc_get_data(unsigned int cpu, + enum acpi_cppc_value which); + +/* + * cpufreq.h + */ + +struct cpufreq_policy { + unsigned long min; + unsigned long max; + char *governor; +}; + +struct cpufreq_available_governors { + char *governor; + struct cpufreq_available_governors *next; + struct cpufreq_available_governors *first; +}; + +struct cpufreq_available_frequencies { + unsigned long frequency; + struct cpufreq_available_frequencies *next; + struct cpufreq_available_frequencies *first; +}; + + +struct cpufreq_affected_cpus { + unsigned int cpu; + struct cpufreq_affected_cpus *next; + struct cpufreq_affected_cpus *first; +}; + +struct cpufreq_stats { + unsigned long frequency; + unsigned long long time_in_state; + struct cpufreq_stats *next; + struct cpufreq_stats *first; +}; + +unsigned long cpufreq_get_freq_kernel(unsigned int cpu); + +unsigned long cpufreq_get_freq_hardware(unsigned int cpu); + +#define cpufreq_get(cpu) cpufreq_get_freq_kernel(cpu); + +unsigned long cpufreq_get_transition_latency(unsigned int cpu); + +int cpufreq_get_hardware_limits(unsigned int cpu, + unsigned long *min, + unsigned long *max); + +char *cpufreq_get_driver(unsigned int cpu); + +void cpufreq_put_driver(char *ptr); + +struct cpufreq_policy *cpufreq_get_policy(unsigned int cpu); + +void 
cpufreq_put_policy(struct cpufreq_policy *policy); + +struct cpufreq_available_governors +*cpufreq_get_available_governors(unsigned int cpu); + +void cpufreq_put_available_governors( + struct cpufreq_available_governors *first); + +struct cpufreq_available_frequencies +*cpufreq_get_available_frequencies(unsigned int cpu); + +void cpufreq_put_available_frequencies( + struct cpufreq_available_frequencies *first); + +struct cpufreq_available_frequencies +*cpufreq_get_boost_frequencies(unsigned int cpu); + +void cpufreq_put_boost_frequencies( + struct cpufreq_available_frequencies *first); + +struct cpufreq_affected_cpus *cpufreq_get_affected_cpus(unsigned + int cpu); + +void cpufreq_put_affected_cpus(struct cpufreq_affected_cpus *first); + +struct cpufreq_affected_cpus *cpufreq_get_related_cpus(unsigned + int cpu); + +void cpufreq_put_related_cpus(struct cpufreq_affected_cpus *first); + +struct cpufreq_stats *cpufreq_get_stats(unsigned int cpu, + unsigned long long *total_time); + +void cpufreq_put_stats(struct cpufreq_stats *stats); + +unsigned long cpufreq_get_transitions(unsigned int cpu); + +int cpufreq_set_policy(unsigned int cpu, struct cpufreq_policy *policy); + +int cpufreq_modify_policy_min(unsigned int cpu, unsigned long min_freq); + +int cpufreq_modify_policy_max(unsigned int cpu, unsigned long max_freq); + +int cpufreq_modify_policy_governor(unsigned int cpu, char *governor); + +int cpufreq_set_frequency(unsigned int cpu, + unsigned long target_frequency); + +unsigned long cpufreq_get_sysfs_value_from_table(unsigned int cpu, + const char **table, + unsigned int index, + unsigned int size); + +/* + * cpuidle.h + */ + +int cpuidle_is_state_disabled(unsigned int cpu, + unsigned int idlestate); +int cpuidle_state_disable(unsigned int cpu, unsigned int idlestate, + unsigned int disable); +unsigned long cpuidle_state_latency(unsigned int cpu, + unsigned int idlestate); +unsigned long cpuidle_state_usage(unsigned int cpu, + unsigned int idlestate); +unsigned long long cpuidle_state_time(unsigned int cpu, + unsigned int idlestate); +char *cpuidle_state_name(unsigned int cpu, + unsigned int idlestate); +char *cpuidle_state_desc(unsigned int cpu, + unsigned int idlestate); +unsigned int cpuidle_state_count(unsigned int cpu); + +char *cpuidle_get_governor(void); + +char *cpuidle_get_driver(void); + +/* + * cpupower.h + */ + +struct cpupower_topology { + /* Amount of CPU cores, packages and threads per core in the system */ + unsigned int cores; + unsigned int pkgs; + unsigned int threads; /* per core */ + + /* Array gets mallocated with cores entries, holding per core info */ + struct cpuid_core_info *core_info; +}; + +struct cpuid_core_info { + int pkg; + int core; + int cpu; + + /* flags */ + unsigned int is_online:1; +}; + +int get_cpu_topology(struct cpupower_topology *cpu_top); + +void cpu_topology_release(struct cpupower_topology cpu_top); + +int cpupower_is_cpu_online(unsigned int cpu); + +/* + * powercap.h + */ + +struct powercap_zone { + char name[MAX_LINE_LEN]; + /* + * sys_name relative to PATH_TO_POWERCAP, + * do not forget the / in between + */ + char sys_name[SYSFS_PATH_MAX]; + int tree_depth; + struct powercap_zone *parent; + struct powercap_zone *children[POWERCAP_MAX_CHILD_ZONES]; + /* More possible caps or attributes to be added? 
*/ + uint32_t has_power_uw:1, + has_energy_uj:1; + +}; + +int powercap_walk_zones(struct powercap_zone *zone, + int (*f)(struct powercap_zone *zone)); + +struct powercap_zone *powercap_init_zones(void); + +int powercap_get_enabled(int *mode); + +int powercap_set_enabled(int mode); + +int powercap_get_driver(char *driver, int buflen); + +int powercap_get_max_energy_range_uj(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_energy_uj(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_max_power_range_uw(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_power_uw(struct powercap_zone *zone, uint64_t *val); + +int powercap_zone_get_enabled(struct powercap_zone *zone, int *mode); + +int powercap_zone_set_enabled(struct powercap_zone *zone, int mode); -- cgit v1.2.3 From 660475266b744ab02695c6f3cdf28b120b884125 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Wed, 4 Sep 2024 22:19:10 -0400 Subject: pm:cpupower: Include test_raw_pylibcpupower.py This script demonstrates how to make use of, and tests, the bindings. In the future, this script could become part of a larger test suite to test the bindings and libcpupower. Signed-off-by: John B. Wyatt IV Signed-off-by: John B. Wyatt IV Signed-off-by: Shuah Khan --- .../bindings/python/test_raw_pylibcpupower.py | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100755 tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py (limited to 'tools') diff --git a/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py b/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py new file mode 100755 index 000000000000..3d6f62b9556a --- /dev/null +++ b/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only + +import raw_pylibcpupower as p + +# Simple function call + +""" +Get cstate count +""" +cpu_cstates_count = p.cpuidle_state_count(0) +if cpu_cstates_count > -1: + print(f"CPU 0 has {cpu_cstates_count} c-states") +else: + print(f"cstate count error: return code: {cpu_cstates_count}") + +""" +Disable cstate (will fail if the above is 0, ex: a virtual machine) +""" +cstate_disabled = p.cpuidle_state_disable(0, 0, 1) +if cpu_cstates_count == 0: + print(f"CPU 0 has {cpu_cstates_count} c-states") +else: + print(f"cstate count error: return code: {cpu_cstates_count}") + +match cstate_disabled: + case 0: + print(f"CPU state disabled") + case -1: + print(f"Idlestate not available") + case _: + print(f"Not documented") + + +# Pointer example + +topo = p.cpupower_topology() +total_cpus = p.get_cpu_topology(topo) +if total_cpus > 0: + print(f"Number of total cpus: {total_cpus} and number of cores: {topo.cores}") +else: + print(f"Error: could not get cpu topology") -- cgit v1.2.3 From f2dbc7790929b2e39a348a344311a8b70e6ed4b4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 4 Sep 2024 14:17:05 -0700 Subject: perf jevents: Ignore sys when determining a model directory Existing sys directories aren't placed under a model directory like skylake. Placing a sys directory there causes the `is_leaf_dir` test to fail and consequently no events or metrics are generated for the model. Ignore sys directories in this case and update the comments to reflect why. This change has no affect, but when testing with a sys directory for a model people have reported running into the no event/metric issue. 
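For readers skimming the Python change below, the leaf test reduces to the following logic, sketched here in C (illustrative only; note that os.scandir() in jevents.py never yields '.' or '..', which a C translation has to skip explicitly):

```
/* Sketch of the check jevents.py now performs: a model directory is a
 * leaf directory, except that a 'sys' subdirectory does not disqualify it. */
#include <dirent.h>
#include <stdbool.h>
#include <string.h>

static bool is_leaf_dir_ignoring_sys(const char *path)
{
	struct dirent *e;
	bool leaf = true;
	DIR *d = opendir(path);

	if (!d)
		return false;
	while ((e = readdir(d)) != NULL) {
		if (e->d_type != DT_DIR)
			continue;
		if (!strcmp(e->d_name, ".") || !strcmp(e->d_name, ".."))
			continue;
		if (strcmp(e->d_name, "sys")) {	/* any other subdir: not a leaf */
			leaf = false;
			break;
		}
	}
	closedir(d);
	return leaf;
}
```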
Reported-by: Stephane Eranian Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Thomas Richter Cc: Xu Yang Link: https://lore.kernel.org/r/20240904211705.915101-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 1d96b2204e52..bb0a5d92df4a 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -635,14 +635,17 @@ def preprocess_one_file(parents: Sequence[str], item: os.DirEntry) -> None: def process_one_file(parents: Sequence[str], item: os.DirEntry) -> None: """Process a JSON file during the main walk.""" - def is_leaf_dir(path: str) -> bool: + def is_leaf_dir_ignoring_sys(path: str) -> bool: for item in os.scandir(path): - if item.is_dir(): + if item.is_dir() and item.name != 'sys': return False return True - # model directory, reset topic - if item.is_dir() and is_leaf_dir(item.path): + # Model directories are leaves (ignoring possible sys + # directories). The FTW will walk into the directory next. Flush + # pending events and metrics and update the table names for the new + # model directory. + if item.is_dir() and is_leaf_dir_ignoring_sys(item.path): print_pending_events() print_pending_metrics() -- cgit v1.2.3 From 4bef6168c145852cebefe2f0bd78f9aaeba52843 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 5 Sep 2024 08:07:55 -0700 Subject: perf script python: Avoid buffer overflow in python PEBS register interface Running a script that processes PEBS records gives buffer overflows in valgrind. The problem is that the allocation of the register string doesn't include the terminating 0 byte. Fix this. I also replaced the very magic "28" with a more reasonable larger buffer that should fit all registers. There's no need to conserve memory here. ==2106591== Memcheck, a memory error detector ==2106591== Copyright (C) 2002-2022, and GNU GPL'd, by Julian Seward et al. 
==2106591== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info ==2106591== Command: ../perf script -i tcall.data gcov.py tcall.gcov ==2106591== ==2106591== Invalid write of size 1 ==2106591== at 0x713354: regs_map (trace-event-python.c:748) ==2106591== by 0x7134EB: set_regs_in_dict (trace-event-python.c:784) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== Address 0x7186fe0 is 0 bytes after a block of size 0 alloc'd ==2106591== at 0x484280F: malloc (vg_replace_malloc.c:442) ==2106591== by 0x7134AD: set_regs_in_dict (trace-event-python.c:780) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== ==2106591== Invalid read of size 1 ==2106591== at 0x484B6C6: strlen (vg_replace_strmem.c:502) ==2106591== by 0x555D494: PyUnicode_FromString (unicodeobject.c:1899) ==2106591== by 0x7134F7: set_regs_in_dict (trace-event-python.c:786) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== Address 0x7186fe0 is 0 bytes after a block of size 0 alloc'd ==2106591== at 0x484280F: malloc (vg_replace_malloc.c:442) ==2106591== by 0x7134AD: set_regs_in_dict (trace-event-python.c:780) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: 
ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== ==2106591== Invalid write of size 1 ==2106591== at 0x713354: regs_map (trace-event-python.c:748) ==2106591== by 0x713539: set_regs_in_dict (trace-event-python.c:789) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== Address 0x7186fe0 is 0 bytes after a block of size 0 alloc'd ==2106591== at 0x484280F: malloc (vg_replace_malloc.c:442) ==2106591== by 0x7134AD: set_regs_in_dict (trace-event-python.c:780) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== ==2106591== Invalid read of size 1 ==2106591== at 0x484B6C6: strlen (vg_replace_strmem.c:502) ==2106591== by 0x555D494: PyUnicode_FromString (unicodeobject.c:1899) ==2106591== by 0x713545: set_regs_in_dict (trace-event-python.c:791) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== Address 0x7186fe0 is 0 bytes after a block of size 0 alloc'd ==2106591== at 0x484280F: malloc (vg_replace_malloc.c:442) ==2106591== by 0x7134AD: set_regs_in_dict (trace-event-python.c:780) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: 
ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== 73056 total, 29 ignored Signed-off-by: Andi Kleen Cc: Adrian Hunter Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240905151058.2127122-2-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/scripting-engines/trace-event-python.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 6971dd6c231f..d7183134b669 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -762,6 +762,8 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch } } +#define MAX_REG_SIZE 128 + static int set_regs_in_dict(PyObject *dict, struct perf_sample *sample, struct evsel *evsel) @@ -769,14 +771,7 @@ static int set_regs_in_dict(PyObject *dict, struct perf_event_attr *attr = &evsel->core.attr; const char *arch = perf_env__arch(evsel__env(evsel)); - /* - * Here value 28 is a constant size which can be used to print - * one register value and its corresponds to: - * 16 chars is to specify 64 bit register in hexadecimal. - * 2 chars is for appending "0x" to the hexadecimal value and - * 10 chars is for register name. - */ - int size = __sw_hweight64(attr->sample_regs_intr) * 28; + int size = (__sw_hweight64(attr->sample_regs_intr) * MAX_REG_SIZE) + 1; char *bf = malloc(size); if (!bf) return -1; -- cgit v1.2.3 From 6e05d28ff232cf445cc6ae59336b7f2081ef9b96 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 5 Sep 2024 10:07:35 -0700 Subject: perf mem: Check mem_events for all eligible PMUs The current perf_pmu__mem_events_init() only checks the availability of the mem_events for the first eligible PMU. It works for non-hybrid machines and hybrid machines that have the same mem_events. However, it may bring issues if a hybrid machine has a different mem_events on different PMU, e.g., Alder Lake and Raptor Lake. A mem-loads-aux event is only required for the p-core. The mem_events on both e-core and p-core should be checked and marked. The issue was not found, because it's hidden by another bug, which only records the mem-events for the e-core. The wrong check for the p-core events didn't yell. 
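The shape of the fix in the diff below, reduced to a sketch: iterate every eligible PMU instead of stopping at the first, because on hybrid parts each PMU can expose a different mem_events[] set. Here perf_pmus__scan_mem() is the real perf iterator, but the init helper name is hypothetical and the rest is simplified:

```
#include <errno.h>

struct perf_pmu;
struct perf_pmu *perf_pmus__scan_mem(struct perf_pmu *pmu);
int init_one_pmu_mem_events(struct perf_pmu *pmu);	/* hypothetical helper */

int mem_events_init_all(void)
{
	struct perf_pmu *pmu = NULL;

	/* Visit p-core and e-core PMUs alike; fail if any lacks support. */
	while ((pmu = perf_pmus__scan_mem(pmu)) != NULL) {
		if (init_one_pmu_mem_events(pmu))
			return -ENOENT;
	}
	return 0;
}
```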
Fixes: abbdd79b786e036e ("perf mem: Clean up perf_mem_events__name()") Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240905170737.4070743-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 2 +- tools/perf/builtin-mem.c | 2 +- tools/perf/util/mem-events.c | 14 +++++++++++++- tools/perf/util/mem-events.h | 2 +- 4 files changed, 16 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index cd2bd573bfc3..cef95b2781c0 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -3261,7 +3261,7 @@ static int perf_c2c__record(int argc, const char **argv) return -1; } - if (perf_pmu__mem_events_init(pmu)) { + if (perf_pmu__mem_events_init()) { pr_err("failed: memory events not supported\n"); return -1; } diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index bb38bb5a1c26..a9f94785809e 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -88,7 +88,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem, return -1; } - if (perf_pmu__mem_events_init(pmu)) { + if (perf_pmu__mem_events_init()) { pr_err("failed: memory events not supported\n"); return -1; } diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index be048bd02f36..17f80013e574 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -192,7 +192,7 @@ static bool perf_pmu__mem_events_supported(const char *mnt, struct perf_pmu *pmu return !stat(path, &st); } -int perf_pmu__mem_events_init(struct perf_pmu *pmu) +static int __perf_pmu__mem_events_init(struct perf_pmu *pmu) { const char *mnt = sysfs__mount(); bool found = false; @@ -219,6 +219,18 @@ int perf_pmu__mem_events_init(struct perf_pmu *pmu) return found ? 0 : -ENOENT; } +int perf_pmu__mem_events_init(void) +{ + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmus__scan_mem(pmu)) != NULL) { + if (__perf_pmu__mem_events_init(pmu)) + return -ENOENT; + } + + return 0; +} + void perf_pmu__mem_events_list(struct perf_pmu *pmu) { int j; diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index ca31014d7934..a6fc2a593938 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -30,7 +30,7 @@ extern unsigned int perf_mem_events__loads_ldlat; extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str); -int perf_pmu__mem_events_init(struct perf_pmu *pmu); +int perf_pmu__mem_events_init(void); struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i); struct perf_pmu *perf_mem_events_find_pmu(void); -- cgit v1.2.3 From 5ad7db2c3f941cde3045ce38a9c4c40b0c7d56b9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 5 Sep 2024 10:07:36 -0700 Subject: perf mem: Fix missed p-core mem events on ADL and RPL The p-core mem events are missed when launching 'perf mem record' on ADL and RPL. root@number:~# perf mem record sleep 1 Memory events are enabled on a subset of CPUs: 16-27 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.032 MB perf.data ] root@number:~# perf evlist cpu_atom/mem-loads,ldlat=30/P cpu_atom/mem-stores/P dummy:u A variable 'record' in the 'struct perf_mem_event' is to indicate whether a mem event in a mem_events[] should be recorded. The current code only configure the variable for the first eligible PMU. 
It's good enough for a non-hybrid machine or a hybrid machine which has the same mem_events[]. However, if a different mem_events[] is used for different PMUs on a hybrid machine, e.g., ADL or RPL, the 'record' for the second PMU never get a chance to be set. The mem_events[] of the second PMU are always ignored. 'perf mem' doesn't support the per-PMU configuration now. A per-PMU mem_events[] 'record' variable doesn't make sense. Make it global. That could also avoid searching for the per-PMU mem_events[] via perf_pmu__mem_events_ptr every time. Committer testing: root@number:~# perf evlist -g cpu_atom/mem-loads,ldlat=30/P cpu_atom/mem-stores/P {cpu_core/mem-loads-aux/,cpu_core/mem-loads,ldlat=30/} cpu_core/mem-stores/P dummy:u root@number:~# The :S for '{cpu_core/mem-loads-aux/,cpu_core/mem-loads,ldlat=30/}' is not being added by 'perf evlist -g', to be checked. Fixes: abbdd79b786e036e ("perf mem: Clean up perf_mem_events__name()") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Closes: https://lore.kernel.org/lkml/Zthu81fA3kLC2CS2@x1/ Link: https://lore.kernel.org/r/20240905170737.4070743-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 12 ++++-------- tools/perf/builtin-mem.c | 17 ++++++----------- tools/perf/util/mem-events.c | 6 ++++-- tools/perf/util/mem-events.h | 2 +- 4 files changed, 15 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index cef95b2781c0..15e1fce71c72 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -3285,19 +3285,15 @@ static int perf_c2c__record(int argc, const char **argv) * PERF_MEM_EVENTS__LOAD_STORE if it is supported. 
*/ if (e->tag) { - e->record = true; + perf_mem_record[PERF_MEM_EVENTS__LOAD_STORE] = true; rec_argv[i++] = "-W"; } else { - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD); - e->record = true; - - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__STORE); - e->record = true; + perf_mem_record[PERF_MEM_EVENTS__LOAD] = true; + perf_mem_record[PERF_MEM_EVENTS__STORE] = true; } } - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD); - if (e->record) + if (perf_mem_record[PERF_MEM_EVENTS__LOAD]) rec_argv[i++] = "-W"; rec_argv[i++] = "-d"; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index a9f94785809e..615e905450b3 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -117,22 +117,17 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem, if (e->tag && (mem->operation & MEM_OPERATION_LOAD) && (mem->operation & MEM_OPERATION_STORE)) { - e->record = true; + perf_mem_record[PERF_MEM_EVENTS__LOAD_STORE] = true; rec_argv[i++] = "-W"; } else { - if (mem->operation & MEM_OPERATION_LOAD) { - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD); - e->record = true; - } + if (mem->operation & MEM_OPERATION_LOAD) + perf_mem_record[PERF_MEM_EVENTS__LOAD] = true; - if (mem->operation & MEM_OPERATION_STORE) { - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__STORE); - e->record = true; - } + if (mem->operation & MEM_OPERATION_STORE) + perf_mem_record[PERF_MEM_EVENTS__STORE] = true; } - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD); - if (e->record) + if (perf_mem_record[PERF_MEM_EVENTS__LOAD]) rec_argv[i++] = "-W"; rec_argv[i++] = "-d"; diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 17f80013e574..051feb93ed8d 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -29,6 +29,8 @@ struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = { }; #undef E +bool perf_mem_record[PERF_MEM_EVENTS__MAX] = { 0 }; + static char mem_loads_name[100]; static char mem_stores_name[100]; @@ -163,7 +165,7 @@ int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str) continue; if (strstr(e->tag, tok)) - e->record = found = true; + perf_mem_record[j] = found = true; } tok = strtok_r(NULL, ",", &saveptr); @@ -261,7 +263,7 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr) for (int j = 0; j < PERF_MEM_EVENTS__MAX; j++) { e = perf_pmu__mem_events_ptr(pmu, j); - if (!e->record) + if (!perf_mem_record[j]) continue; if (!e->supported) { diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index a6fc2a593938..8dc27db9fd52 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -6,7 +6,6 @@ #include struct perf_mem_event { - bool record; bool supported; bool ldlat; u32 aux_event; @@ -28,6 +27,7 @@ struct perf_pmu; extern unsigned int perf_mem_events__loads_ldlat; extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; +extern bool perf_mem_record[PERF_MEM_EVENTS__MAX]; int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str); int perf_pmu__mem_events_init(void); -- cgit v1.2.3 From 003265bb6f028d7bcd7cbd92d6ba2b4e26382796 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 5 Sep 2024 10:07:37 -0700 Subject: perf mem: Fix the wrong reference in parse_record_events() A segmentation fault can be triggered when running 'perf mem record -e ldlat-loads' The commit 35b38a71c92fa033 ("perf mem: Rework command option handling") moves the OPT_CALLBACK of event from __cmd_record() to cmd_mem(). 
When __cmd_record() is invoked, 'mem' has already been referenced (&). So the &mem passed into parse_record_events() is a double reference (&&) of the original struct perf_mem mem, whereas in cmd_mem() the &mem is a single reference (&) of the original struct perf_mem mem. Fixes: 35b38a71c92fa033 ("perf mem: Rework command option handling") Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240905170737.4070743-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 615e905450b3..651188c1d825 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -47,7 +47,7 @@ struct perf_mem { static int parse_record_events(const struct option *opt, const char *str, int unset __maybe_unused) { - struct perf_mem *mem = *(struct perf_mem **)opt->value; + struct perf_mem *mem = (struct perf_mem *)opt->value; struct perf_pmu *pmu; pmu = perf_mem_events_find_pmu(); -- cgit v1.2.3 From 7beaf1da074f7ea25454d6c11da142c3892d3c4e Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 5 Sep 2024 12:02:31 -0600 Subject: selftests:resctrl: Fix build failure on archs without __cpuid_count() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When resctrl is built on architectures without __cpuid_count() support, the build fails. resctrl uses __cpuid_count() defined in kselftest.h. Even though the problem is seen while building resctrl on aarch64, this error can be seen on any platform that doesn't support CPUID. CPUID is an x86/x86-64 feature, and code paths with CPUID asm commands will fail to build on all other architectures. All other tests that call __cpuid_count() do so from x86/x86_64 code paths when __i386__ or __x86_64__ are defined; resctrl is an exception. Fix the problem by defining __cpuid_count() only when __i386__ or __x86_64__ are defined in kselftest.h and changing resctrl to call __cpuid_count() only when __i386__ or __x86_64__ are defined.
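Distilled to a sketch, the pattern the fix applies looks like this; the leaf 0x10/subleaf 1 query and the ECX bit mirror the resctrl hunk further below, while everything else here is illustrative:

```
#include <stdbool.h>

static bool supports_noncontiguous_cbm(void)
{
#if defined(__i386__) || defined(__x86_64__)	/* arch guard */
	unsigned int eax, ebx, ecx, edx;

	/* CPUID leaf 0x10, subleaf 1: L3 CAT capabilities */
	__asm__ __volatile__("cpuid\n\t"
			     : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
			     : "0" (0x10), "2" (1));
	return (ecx >> 3) & 1;	/* non-contiguous CBM supported */
#else
	return false;	/* non-x86 builds never compile the asm */
#endif
}
```

The aarch64 build failure being fixed looked like this: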
In file included from resctrl.h:24, from cat_test.c:11: In function ‘arch_supports_noncont_cat’, inlined from ‘noncont_cat_run_test’ at cat_test.c:326:6: ../kselftest.h:74:9: error: impossible constraint in ‘asm’ 74 | __asm__ __volatile__ ("cpuid\n\t" \ | ^~~~~~~ cat_test.c:304:17: note: in expansion of macro ‘__cpuid_count’ 304 | __cpuid_count(0x10, 1, eax, ebx, ecx, edx); | ^~~~~~~~~~~~~ ../kselftest.h:74:9: error: impossible constraint in ‘asm’ 74 | __asm__ __volatile__ ("cpuid\n\t" \ | ^~~~~~~ cat_test.c:306:17: note: in expansion of macro ‘__cpuid_count’ 306 | __cpuid_count(0x10, 2, eax, ebx, ecx, edx); Fixes: ae638551ab64 ("selftests/resctrl: Add non-contiguous CBMs CAT test") Reported-by: Muhammad Usama Anjum Closes: https://lore.kernel.org/lkml/20240809071059.265914-1-usama.anjum@collabora.com/ Reported-by: Ilpo Järvinen Signed-off-by: Shuah Khan Acked-by: Reinette Chatre Reviewed-by: Muhammad Usama Anjum Reviewed-by: Ilpo Järvinen Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest.h | 2 ++ tools/testing/selftests/resctrl/cat_test.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index b8967b6e29d5..e195ec156859 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -61,6 +61,7 @@ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* * gcc cpuid.h provides __cpuid_count() since v4.4. * Clang/LLVM cpuid.h provides __cpuid_count() since v3.4.0. @@ -75,6 +76,7 @@ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ : "0" (level), "2" (count)) #endif +#endif /* end arch */ /* define kselftest exit codes */ #define KSFT_PASS 0 diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 742782438ca3..94cfdba5308d 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -290,12 +290,12 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param static bool arch_supports_noncont_cat(const struct resctrl_test *test) { - unsigned int eax, ebx, ecx, edx; - /* AMD always supports non-contiguous CBM. */ if (get_vendor() == ARCH_AMD) return true; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax, ebx, ecx, edx; /* Intel support for non-contiguous CBM needs to be discovered. */ if (!strcmp(test->resource, "L3")) __cpuid_count(0x10, 1, eax, ebx, ecx, edx); @@ -305,6 +305,9 @@ static bool arch_supports_noncont_cat(const struct resctrl_test *test) return false; return ((ecx >> 3) & 1); +#endif /* end arch */ + + return false; } static int noncont_cat_run_test(const struct resctrl_test *test, -- cgit v1.2.3 From 80e67f1802d0fc21543216557a68320c71d7dbe1 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Fri, 6 Sep 2024 09:00:06 -0400 Subject: pm:cpupower: Add error warning when SWIG is not installed Add error message to better explain to the user when SWIG and python-config is missing from the path. Makefile was cleaned up and unneeded elements were removed. Suggested-by: Shuah Khan Signed-off-by: John B. Wyatt IV Signed-off-by: John B. 
Wyatt IV Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/Makefile | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile index d0418f902795..dc09c5b66ead 100644 --- a/tools/power/cpupower/bindings/python/Makefile +++ b/tools/power/cpupower/bindings/python/Makefile @@ -4,26 +4,28 @@ # This Makefile expects you have already run the makefile for cpupower to build # the .o files in the lib directory for the bindings to be created. -CC=gcc - -LIB_DIR = ../../lib -BIND_DIR = . -PY_INCLUDE := $(firstword $(shell python-config --includes)) -#PY_INCLUDE = $(shell python-config --includes | awk '{ print $1 }') +CC := gcc +HAVE_SWIG := $(shell if which swig >/dev/null 2>&1; then echo 1; else echo 0; fi) +HAVE_PYCONFIG := $(shell if which python-config >/dev/null 2>&1; then echo 1; else echo 0; fi) +LIB_DIR := ../../lib +PY_INCLUDE = $(firstword $(shell python-config --includes)) OBJECTS_LIB = $(wildcard $(LIB_DIR)/*.o) -OBJECTS_BIND = $(wildcard $(BIND_DIR)/*.o) all: _raw_pylibcpupower.so _raw_pylibcpupower.so: raw_pylibcpupower_wrap.o - $(CC) -shared $(OBJECTS_LIB) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so # raw_pylibcpupower_wrap.o -# $(CC) -shared $(OBJECTS_BIND) $(OBJECTS_LIB) -o _raw_pylibcpupower.so # raw_pylibcpupower_wrap.o + $(CC) -shared $(OBJECTS_LIB) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so raw_pylibcpupower_wrap.o: raw_pylibcpupower_wrap.c $(CC) -fPIC -c raw_pylibcpupower_wrap.c $(PY_INCLUDE) raw_pylibcpupower_wrap.c: raw_pylibcpupower.i +ifeq ($(HAVE_SWIG),0) + $(error "swig was not found. Make sure you have it installed and in the PATH to generate the bindings.") +else ifeq ($(HAVE_PYCONFIG),0) + $(error "python-config was not found. Make sure you have it installed and in the PATH to generate the bindings.") +endif swig -python raw_pylibcpupower.i # Will only clean the bindings folder; will not clean the actual cpupower folder -- cgit v1.2.3 From af1ec38c6ccc31ec963ac4bcf8f6a7d8f44d210a Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Fri, 6 Sep 2024 10:52:59 +0800 Subject: selftests/timers: Remove unused NSEC_PER_SEC macro By reading the code, I found the macro NSEC_PER_SEC is never referenced in the code. Just remove it. 
Signed-off-by: zhang jiao Reviewed-by: Shuah Khan Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/change_skew.c | 3 --- tools/testing/selftests/timers/skew_consistency.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c index 4421cd562c24..18e794a46c23 100644 --- a/tools/testing/selftests/timers/change_skew.c +++ b/tools/testing/selftests/timers/change_skew.c @@ -30,9 +30,6 @@ #include #include "../kselftest.h" -#define NSEC_PER_SEC 1000000000LL - - int change_skew_test(int ppm) { struct timex tx; diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index c8e6bffe4e0a..83450145fe65 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -36,8 +36,6 @@ #include #include "../kselftest.h" -#define NSEC_PER_SEC 1000000000LL - int main(int argc, char **argv) { struct timex tx; -- cgit v1.2.3 From f8c6b7913dfaa67475883f94261c278adbcaa0ae Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 6 Sep 2024 14:24:53 +0100 Subject: bpftool: Improve btf c dump sorting stability Existing algorithm for BTF C dump sorting uses only types and names of the structs and unions for ordering. As dump contains structs with the same names but different contents, relative to each other ordering of those structs will be accidental. This patch addresses this problem by introducing a new sorting field that contains hash of the struct/union field names and types to disambiguate comparison of the non-unique named structs. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240906132453.146085-1-mykyta.yatsenko5@gmail.com --- tools/bpf/bpftool/btf.c | 80 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 3b57ba095ab6..7d2af1ff3c8d 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -50,6 +50,7 @@ struct sort_datum { int type_rank; const char *sort_name; const char *own_name; + __u64 disambig_hash; }; static const char *btf_int_enc_str(__u8 encoding) @@ -584,20 +585,88 @@ static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool f return NULL; } +static __u64 hasher(__u64 hash, __u64 val) +{ + return hash * 31 + val; +} + +static __u64 btf_name_hasher(__u64 hash, const struct btf *btf, __u32 name_off) +{ + if (!name_off) + return hash; + + return hasher(hash, str_hash(btf__name_by_offset(btf, name_off))); +} + +static __u64 btf_type_disambig_hash(const struct btf *btf, __u32 id, bool include_members) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + int i; + size_t hash = 0; + + hash = btf_name_hasher(hash, btf, t->name_off); + + switch (btf_kind(t)) { + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + for (i = 0; i < btf_vlen(t); i++) { + __u32 name_off = btf_is_enum(t) ? 
+ btf_enum(t)[i].name_off : + btf_enum64(t)[i].name_off; + + hash = btf_name_hasher(hash, btf, name_off); + } + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (!include_members) + break; + for (i = 0; i < btf_vlen(t); i++) { + const struct btf_member *m = btf_members(t) + i; + + hash = btf_name_hasher(hash, btf, m->name_off); + /* resolve field type's name and hash it as well */ + hash = hasher(hash, btf_type_disambig_hash(btf, m->type, false)); + } + break; + case BTF_KIND_TYPE_TAG: + case BTF_KIND_CONST: + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_DECL_TAG: + hash = hasher(hash, btf_type_disambig_hash(btf, t->type, include_members)); + break; + case BTF_KIND_ARRAY: { + struct btf_array *arr = btf_array(t); + + hash = hasher(hash, arr->nelems); + hash = hasher(hash, btf_type_disambig_hash(btf, arr->type, include_members)); + break; + } + default: + break; + } + return hash; +} + static int btf_type_compare(const void *left, const void *right) { const struct sort_datum *d1 = (const struct sort_datum *)left; const struct sort_datum *d2 = (const struct sort_datum *)right; int r; - if (d1->type_rank != d2->type_rank) - return d1->type_rank < d2->type_rank ? -1 : 1; - - r = strcmp(d1->sort_name, d2->sort_name); + r = d1->type_rank - d2->type_rank; + r = r ?: strcmp(d1->sort_name, d2->sort_name); + r = r ?: strcmp(d1->own_name, d2->own_name); if (r) return r; - return strcmp(d1->own_name, d2->own_name); + if (d1->disambig_hash != d2->disambig_hash) + return d1->disambig_hash < d2->disambig_hash ? -1 : 1; + + return d1->index - d2->index; } static struct sort_datum *sort_btf_c(const struct btf *btf) @@ -618,6 +687,7 @@ static struct sort_datum *sort_btf_c(const struct btf *btf) d->type_rank = btf_type_rank(btf, i, false); d->sort_name = btf_type_sort_name(btf, i, false); d->own_name = btf__name_by_offset(btf, t->name_off); + d->disambig_hash = btf_type_disambig_hash(btf, i, true); } qsort(datums, n, sizeof(struct sort_datum), btf_type_compare); -- cgit v1.2.3 From 8a3f14bb1e944f496d1a16096435ddb2e12e4753 Mon Sep 17 00:00:00 2001 From: Sam James Date: Fri, 6 Sep 2024 14:48:14 +0100 Subject: libbpf: Workaround (another) -Wmaybe-uninitialized false positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We get this with GCC 15 -O3 (at least): ``` libbpf.c: In function ‘bpf_map__init_kern_struct_ops’: libbpf.c:1109:18: error: ‘mod_btf’ may be used uninitialized [-Werror=maybe-uninitialized] 1109 | kern_btf = mod_btf ? mod_btf->btf : obj->btf_vmlinux; | ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ libbpf.c:1094:28: note: ‘mod_btf’ was declared here 1094 | struct module_btf *mod_btf; | ^~~~~~~ In function ‘find_struct_ops_kern_types’, inlined from ‘bpf_map__init_kern_struct_ops’ at libbpf.c:1102:8: libbpf.c:982:21: error: ‘btf’ may be used uninitialized [-Werror=maybe-uninitialized] 982 | kern_type = btf__type_by_id(btf, kern_type_id); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ libbpf.c: In function ‘bpf_map__init_kern_struct_ops’: libbpf.c:967:21: note: ‘btf’ was declared here 967 | struct btf *btf; | ^~~ ``` This is similar to the other libbpf fix from a few weeks ago for the same modelling-errno issue (fab45b962749184e1a1a57c7c583782b78fad539). 
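The false positive boils down to the following pattern, sketched outside libbpf with illustrative names: the out-pointer is assigned only on a helper's success path, and GCC cannot prove the error check keeps the later use unreachable, so an explicit NULL initialization silences the warning.

```
#include <stddef.h>

struct btf;

/* Assumed helper shape: sets *res only on success, else returns -errno. */
int find_btf(const char *name, struct btf **res);

int use_btf(const char *name)
{
	struct btf *btf = NULL;	/* the workaround: initialize explicitly */
	int err = find_btf(name, &btf);

	if (err)
		return err;
	/*
	 * Without the "= NULL" above, GCC at -O3 may report 'btf' as
	 * possibly uninitialized here, even though err != 0 covers the
	 * path where it was never assigned.
	 */
	return btf != NULL;
}
```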
Signed-off-by: Sam James Signed-off-by: Andrii Nakryiko Link: https://bugs.gentoo.org/939106 Link: https://lore.kernel.org/bpf/f6962729197ae7cdf4f6d1512625bd92f2322d31.1725630494.git.sam@gentoo.org --- tools/lib/bpf/libbpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 27ad3c6ee868..4f29e06c2641 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -986,7 +986,7 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, { const struct btf_type *kern_type, *kern_vtype; const struct btf_member *kern_data_member; - struct btf *btf; + struct btf *btf = NULL; __s32 kern_vtype_id, kern_type_id; char tname[256]; __u32 i; @@ -1116,7 +1116,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) const struct btf *btf = obj->btf; struct bpf_struct_ops *st_ops; const struct btf *kern_btf; - struct module_btf *mod_btf; + struct module_btf *mod_btf = NULL; void *data, *kern_data; const char *tname; int err; -- cgit v1.2.3 From 94e86b174d103d941b4afc4f016af8af9e5352fa Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Fri, 6 Sep 2024 02:13:33 -0700 Subject: tools/hv: Add memory allocation check in hv_fcopy_start Added error handling for memory allocation failures of file_name and path_name. Signed-off-by: Zhu Jun Reviewed-by: Dexuan Cui Tested-by: Saurabh Sengar Link: https://lore.kernel.org/r/20240906091333.11419-1-zhujun2@cmss.chinamobile.com Signed-off-by: Wei Liu Message-ID: <20240906091333.11419-1-zhujun2@cmss.chinamobile.com> --- tools/hv/hv_fcopy_uio_daemon.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 3ce316cc9f97..7a00f3066a98 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -296,6 +296,13 @@ static int hv_fcopy_start(struct hv_start_fcopy *smsg_in) file_name = (char *)malloc(file_size * sizeof(char)); path_name = (char *)malloc(path_size * sizeof(char)); + if (!file_name || !path_name) { + free(file_name); + free(path_name); + syslog(LOG_ERR, "Can't allocate memory for file name and/or path name"); + return HV_E_FAIL; + } + wcstoutf8(file_name, (__u16 *)in_file_name, file_size); wcstoutf8(path_name, (__u16 *)in_path_name, path_size); -- cgit v1.2.3 From 8c9c01ce695eea84d19482e7429e3d54ceb7585c Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 12 Aug 2024 15:11:52 +0530 Subject: selftests/powerpc: Allow building without static libc Currently exec-target.c is linked statically with libc, which on Fedora at least requires installing an additional package (glibc-static). If that package is not installed the build fails with: CC exec_target /usr/bin/ld: cannot find -lc: No such file or directory collect2: error: ld returned 1 exit status All exec_target.c does is call sys_exit, which can be done easily enough using inline assembly, and removes the requirement for a static libc to be installed. 
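For comparison only (a sketch; the patch itself touches nothing but powerpc), the same no-libc exit pattern on x86-64, where the exit syscall number is 60, the status goes in rdi, and the syscall instruction clobbers rcx and r11:

```
/* Illustrative x86-64 counterpart; build with -nostartfiles -nostdlib. */
void _start(void)
{
	__asm__ __volatile__(
		"syscall"			/* exit(0) */
		:
		: "a" (60), "D" (0)		/* rax = SYS_exit, rdi = 0 */
		: "rcx", "r11", "memory");
	__builtin_unreachable();		/* sys_exit never returns */
}
```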
Suggested-by: Michael Ellerman Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240812094152.418586-1-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/benchmarks/Makefile | 2 +- tools/testing/selftests/powerpc/benchmarks/exec_target.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index 1321922038d0..ca4483c238b9 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -18,4 +18,4 @@ $(OUTPUT)/context_switch: LDLIBS += -lpthread $(OUTPUT)/fork: LDLIBS += -lpthread -$(OUTPUT)/exec_target: CFLAGS += -static -nostartfiles +$(OUTPUT)/exec_target: CFLAGS += -nostartfiles diff --git a/tools/testing/selftests/powerpc/benchmarks/exec_target.c b/tools/testing/selftests/powerpc/benchmarks/exec_target.c index c14b0fc1edde..a6408d3f26cd 100644 --- a/tools/testing/selftests/powerpc/benchmarks/exec_target.c +++ b/tools/testing/selftests/powerpc/benchmarks/exec_target.c @@ -7,10 +7,22 @@ */ #define _GNU_SOURCE -#include #include void _start(void) { - syscall(SYS_exit, 0); + asm volatile ( + "li %%r0, %[sys_exit];" + "li %%r3, 0;" + "sc;" + : + : [sys_exit] "i" (SYS_exit) + /* + * "sc" will clobber r0, r3-r13, cr0, ctr, xer and memory. + * Even though sys_exit never returns, handle clobber + * registers. + */ + : "r0", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "r13", "cr0", "ctr", "xer", "memory" + ); } -- cgit v1.2.3 From 5c6e3d5a5da118be2ae074bd70d111994147c708 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Wed, 28 Aug 2024 16:42:29 +0800 Subject: cxl/pci: Remove duplicated implementation of waiting for memory_info_valid commit ce17ad0d5498 ("cxl: Wait Memory_Info_Valid before access memory related info") added another implementation, which is cxl_dvsec_mem_range_valid(), of waiting for memory_info_valid without realizing it duplicated wait_for_valid(). Remove wait_for_valid() and retain cxl_dvsec_mem_range_valid() as the former is hardcoded to check only the Memory_Info_Valid bit of DVSEC range 1, while the latter allows for selection between DVSEC range 1 or 2 via parameter. Suggested-by: Dan Williams Reviewed-by: Jonathan Cameron Signed-off-by: Yanfei Xu Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20240828084231.1378789-3-yanfei.xu@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 41 +++++------------------------------------ drivers/cxl/cxl.h | 2 +- drivers/cxl/port.c | 2 +- tools/testing/cxl/test/mock.c | 4 ++-- 4 files changed, 9 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 075a2b2d2ea0..37e537e50b34 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -211,37 +211,6 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(cxl_await_media_ready, CXL); -static int wait_for_valid(struct pci_dev *pdev, int d) -{ - u32 val; - int rc; - - /* - * Memory_Info_Valid: When set, indicates that the CXL Range 1 Size high - * and Size Low registers are valid. Must be set within 1 second of - * deassertion of reset to CXL device. Likely it is already set by the - * time this runs, but otherwise give a 1.5 second timeout in case of - * clock skew. 
- */ - rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val); - if (rc) - return rc; - - if (val & CXL_DVSEC_MEM_INFO_VALID) - return 0; - - msleep(1500); - - rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val); - if (rc) - return rc; - - if (val & CXL_DVSEC_MEM_INFO_VALID) - return 0; - - return -ETIMEDOUT; -} - static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); @@ -322,11 +291,13 @@ static int devm_cxl_enable_hdm(struct device *host, struct cxl_hdm *cxlhdm) return devm_add_action_or_reset(host, disable_hdm, cxlhdm); } -int cxl_dvsec_rr_decode(struct device *dev, int d, +int cxl_dvsec_rr_decode(struct device *dev, struct cxl_port *port, struct cxl_endpoint_dvsec_info *info) { struct pci_dev *pdev = to_pci_dev(dev); + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); int hdm_count, rc, i, ranges = 0; + int d = cxlds->cxl_dvsec; u16 cap, ctrl; if (!d) { @@ -353,11 +324,9 @@ int cxl_dvsec_rr_decode(struct device *dev, int d, if (!hdm_count || hdm_count > 2) return -EINVAL; - rc = wait_for_valid(pdev, d); - if (rc) { - dev_dbg(dev, "Failure awaiting MEM_INFO_VALID (%d)\n", rc); + rc = cxl_dvsec_mem_range_valid(cxlds, 0); + if (rc) return rc; - } /* * The current DVSEC values are moot if the memory capability is diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 2890f7e2aad5..0fc96f8bf15c 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -811,7 +811,7 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, struct cxl_endpoint_dvsec_info *info); int devm_cxl_add_passthrough_decoder(struct cxl_port *port); -int cxl_dvsec_rr_decode(struct device *dev, int dvsec, +int cxl_dvsec_rr_decode(struct device *dev, struct cxl_port *port, struct cxl_endpoint_dvsec_info *info); bool is_cxl_region(struct device *dev); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index d7d5d982ce69..861dde65768f 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -98,7 +98,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) struct cxl_port *root; int rc; - rc = cxl_dvsec_rr_decode(cxlds->dev, cxlds->cxl_dvsec, &info); + rc = cxl_dvsec_rr_decode(cxlds->dev, port, &info); if (rc < 0) return rc; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index bbd7d938156d..f4ce96cc11d4 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -228,7 +228,7 @@ int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_hdm_decode_init, CXL); -int __wrap_cxl_dvsec_rr_decode(struct device *dev, int dvsec, +int __wrap_cxl_dvsec_rr_decode(struct device *dev, struct cxl_port *port, struct cxl_endpoint_dvsec_info *info) { int rc = 0, index; @@ -237,7 +237,7 @@ int __wrap_cxl_dvsec_rr_decode(struct device *dev, int dvsec, if (ops && ops->is_mock_dev(dev)) rc = 0; else - rc = cxl_dvsec_rr_decode(dev, dvsec, info); + rc = cxl_dvsec_rr_decode(dev, port, info); put_cxl_mock_ops(index); return rc; -- cgit v1.2.3 From c1632cc5ed386cc8d85693ef578a106f4fb1932c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 6 Sep 2024 15:37:43 -0300 Subject: perf trace augmented_syscalls.bpf: Move the renameat aumenter to renameat2, temporarily While trying to shape Howard Chu's generic BPF augmenter transition into the codebase I got stuck with the renameat2 syscall. 
Until I noticed that the attempt at reusing augmenters was making it use the 'openat' syscall augmenter, which collects just one string syscall arg, for the 'renameat2' syscall, which takes two strings. So, for the moment, just to help in this transition period, since 'renameat2' is what the 'mv' utility uses these days, make the BPF collector be associated with the more widely used syscall; hopefully the transition to Howard's generic BPF augmenter will cure this, so get it out of the way for now! So now we still have that odd "reuse", but for something we're not testing, so it won't get in the way anymore: root@number:~# rm -f 987654 ; touch 123456 ; perf trace -vv -e rename* mv 123456 987654 |& grep renameat Reusing "openat" BPF sys_enter augmenter for "renameat" 0.000 ( 0.079 ms): mv/1158612 renameat2(olddfd: CWD, oldname: "123456", newdfd: CWD, newname: "987654", flags: NOREPLACE) = 0 root@number:~# Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Howard Chu Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fXjGYs=tpBgETK-P9U-CuXssytk9pSnTXpfphrmmOydWA@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 0acbd74e8c76..0f9bd2690d4e 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -260,8 +260,8 @@ int sys_enter_rename(struct syscall_enter_args *args) return augmented__output(args, augmented_args, len); } -SEC("tp/syscalls/sys_enter_renameat") -int sys_enter_renameat(struct syscall_enter_args *args) +SEC("tp/syscalls/sys_enter_renameat2") +int sys_enter_renameat2(struct syscall_enter_args *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void *)args->args[1], -- cgit v1.2.3 From c90a88d33a23a8b3c58ee0e1d18d7392244b9b03 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 6 Sep 2024 16:26:45 -0300 Subject: perf trace: Use a common encoding for augmented arguments, with size + error + payload We were using a more compact format, without explicitly encoding the size and possible error in the payload for an argument. To do it generically, at least as Howard Chu did in his GSoC activities, it is more convenient to use the same model that was being used for string arguments, passing { size, error, payload }.
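To make the record layout concrete before the diff below, here is a minimal userspace-side sketch of walking such { size, err, payload } records; struct augmented_arg_hdr and decode_one() are illustrative names, not part of the patch, and for string payloads the producer u64-aligns the size, as the diff shows.

#include <stdint.h>
#include <stdio.h>

/* Illustrative header mirroring the { size, err, payload } encoding */
struct augmented_arg_hdr {
	unsigned int size;	/* bytes of payload following this header */
	int err;		/* read error reported by the BPF collector */
};

static const void *decode_one(const void *cursor, int idx)
{
	const struct augmented_arg_hdr *hdr = cursor;
	const unsigned char *payload = (const unsigned char *)(hdr + 1);

	if (hdr->err)
		printf("arg#%d: payload not collected (err=%d)\n", idx, hdr->err);
	else
		printf("arg#%d: %u byte(s) of payload\n", idx, hdr->size);

	return payload + hdr->size;	/* next { size, err, payload } record */
}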
So use that for the non-string syscall args we have so far: struct timespec struct perf_event_attr struct sockaddr (this one even has a variable size) With this in place we have the userspace pretty printers: perf_event_attr___scnprintf() syscall_arg__scnprintf_augmented_sockaddr() syscall_arg__scnprintf_augmented_timespec() Ready to have the generic BPF collector in tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c sending its generic payload, and thus we'll use them instead of a generic libbpf btf_dump interface that doesn't know about the sockaddr mux or perf_event_attr non-trivial fields (sample_type, etc), leaving it as a (useful) fallback that prints just basic types until we put in place a more sophisticated pretty printer infrastructure that associates synthesized enums to struct fields using the header scrapers we have in tools/perf/trace/beauty/, some of them in this list: $ ls tools/perf/trace/beauty/*.sh tools/perf/trace/beauty/arch_errno_names.sh tools/perf/trace/beauty/kcmp_type.sh tools/perf/trace/beauty/perf_ioctl.sh tools/perf/trace/beauty/statx_mask.sh tools/perf/trace/beauty/clone.sh tools/perf/trace/beauty/kvm_ioctl.sh tools/perf/trace/beauty/pkey_alloc_access_rights.sh tools/perf/trace/beauty/sync_file_range.sh tools/perf/trace/beauty/drm_ioctl.sh tools/perf/trace/beauty/madvise_behavior.sh tools/perf/trace/beauty/prctl_option.sh tools/perf/trace/beauty/usbdevfs_ioctl.sh tools/perf/trace/beauty/fadvise.sh tools/perf/trace/beauty/mmap_flags.sh tools/perf/trace/beauty/rename_flags.sh tools/perf/trace/beauty/vhost_virtio_ioctl.sh tools/perf/trace/beauty/fs_at_flags.sh tools/perf/trace/beauty/mmap_prot.sh tools/perf/trace/beauty/sndrv_ctl_ioctl.sh tools/perf/trace/beauty/x86_arch_prctl.sh tools/perf/trace/beauty/fsconfig.sh tools/perf/trace/beauty/mount_flags.sh tools/perf/trace/beauty/sndrv_pcm_ioctl.sh tools/perf/trace/beauty/fsmount.sh tools/perf/trace/beauty/move_mount_flags.sh tools/perf/trace/beauty/sockaddr.sh tools/perf/trace/beauty/fspick.sh tools/perf/trace/beauty/mremap_flags.sh tools/perf/trace/beauty/socket.sh $ Testing it: root@number:~# rm -f 987654 ; touch 123456 ; perf trace -e rename* mv 123456 987654 0.000 ( 0.031 ms): mv/1193096 renameat2(olddfd: CWD, oldname: "123456", newdfd: CWD, newname: "987654", flags: NOREPLACE) = 0 root@number:~# perf trace -e *nanosleep sleep 1.2345678901 0.000 (1234.654 ms): sleep/1192697 clock_nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567891 }, rmtp: 0x7ffe1ea80460) = 0 root@number:~# perf trace -e perf_event_open* perf stat -e cpu-clock sleep 1 0.000 ( 0.011 ms): perf/1192701 perf_event_open(attr_uptr: { type: 1 (software), size: 136, config: 0 (PERF_COUNT_SW_CPU_CLOCK), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 1192702 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 3 Performance counter stats for 'sleep 1': 0.51 msec cpu-clock # 0.001 CPUs utilized 1.001242090 seconds time elapsed 0.000000000 seconds user 0.001010000 seconds sys root@number:~# perf trace -e connect* ping -c 1 bsky.app 0.000 ( 0.130 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: LOCAL, path: /run/systemd/resolve/io.systemd.Resolve }, addrlen: 42) = 0 23.907 ( 0.006 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.20.108.158 }, addrlen: 16) = 0 23.915 PING bsky.app (3.20.108.158) 56(84) bytes of data.
( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.917 ( 0.002 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.12.170.30 }, addrlen: 16) = 0 23.921 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.923 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 18.217.70.179 }, addrlen: 16) = 0 23.925 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.927 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.132.20.46 }, addrlen: 16) = 0 23.930 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.931 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.142.89.165 }, addrlen: 16) = 0 23.934 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.935 ( 0.002 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 18.119.147.159 }, addrlen: 16) = 0 23.938 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.940 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.22.38.164 }, addrlen: 16) = 0 23.942 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.944 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.13.14.133 }, addrlen: 16) = 0 23.956 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 1025, addr: 3.20.108.158 }, addrlen: 16) = 0 ^C --- bsky.app ping statistics --- 1 packets transmitted, 0 received, 100% packet loss, time 0ms root@number:~# Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Howard Chu Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fW4=2GoP6foAN6qbrCiUzy0a_TzHbd8rvDsakTPfdzvfg@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/perf_event_open.c | 2 +- tools/perf/trace/beauty/sockaddr.c | 2 +- tools/perf/trace/beauty/timespec.c | 2 +- .../util/bpf_skel/augmented_raw_syscalls.bpf.c | 104 +++++++++++++-------- 4 files changed, 66 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c index 01ee15fe9d0c..632237128640 100644 --- a/tools/perf/trace/beauty/perf_event_open.c +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -76,7 +76,7 @@ static size_t perf_event_attr___scnprintf(struct perf_event_attr *attr, char *bf static size_t syscall_arg__scnprintf_augmented_perf_event_attr(struct syscall_arg *arg, char *bf, size_t size) { - return perf_event_attr___scnprintf((void *)arg->augmented.args, bf, size, arg->trace->show_zeros); + return perf_event_attr___scnprintf((void *)arg->augmented.args->value, bf, size, arg->trace->show_zeros); } static size_t syscall_arg__scnprintf_perf_event_attr(char *bf, size_t size, struct syscall_arg *arg) diff --git a/tools/perf/trace/beauty/sockaddr.c b/tools/perf/trace/beauty/sockaddr.c index 2e0e867c0c1b..a17a27ac2a6f 100644 --- a/tools/perf/trace/beauty/sockaddr.c +++ b/tools/perf/trace/beauty/sockaddr.c @@ -47,7 +47,7 @@ static size_t (*af_scnprintfs[])(struct sockaddr *sa, char *bf, size_t size) = { static size_t syscall_arg__scnprintf_augmented_sockaddr(struct syscall_arg *arg, char *bf, size_t size) { - struct sockaddr *sa = (struct sockaddr *)arg->augmented.args; + struct 
sockaddr *sa = (struct sockaddr *)&arg->augmented.args->value; char family[32]; size_t printed; diff --git a/tools/perf/trace/beauty/timespec.c b/tools/perf/trace/beauty/timespec.c index e1a61f092aad..b14ab72a2738 100644 --- a/tools/perf/trace/beauty/timespec.c +++ b/tools/perf/trace/beauty/timespec.c @@ -7,7 +7,7 @@ static size_t syscall_arg__scnprintf_augmented_timespec(struct syscall_arg *arg, char *bf, size_t size) { - struct timespec *ts = (struct timespec *)arg->augmented.args; + struct timespec *ts = (struct timespec *)arg->augmented.args->value; return scnprintf(bf, size, "{ .tv_sec: %" PRIu64 ", .tv_nsec: %" PRIu64 " }", ts->tv_sec, ts->tv_nsec); } diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 0f9bd2690d4e..9c7d2f855294 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -10,6 +10,9 @@ #include #include +#define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) +#define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) + /** * is_power_of_2() - check if a value is a power of two * @n: the value to check @@ -66,19 +69,6 @@ struct syscall_exit_args { long ret; }; -struct augmented_arg { - unsigned int size; - int err; - char value[PATH_MAX]; -}; - -struct pids_filtered { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, pid_t); - __type(value, bool); - __uint(max_entries, 64); -} pids_filtered SEC(".maps"); - /* * Desired design of maximum size and alignment (see RFC2553) */ @@ -105,17 +95,27 @@ struct sockaddr_storage { }; }; -struct augmented_args_payload { - struct syscall_enter_args args; - union { - struct { - struct augmented_arg arg, arg2; - }; +struct augmented_arg { + unsigned int size; + int err; + union { + char value[PATH_MAX]; struct sockaddr_storage saddr; - char __data[sizeof(struct augmented_arg)]; }; }; +struct pids_filtered { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, bool); + __uint(max_entries, 64); +} pids_filtered SEC(".maps"); + +struct augmented_args_payload { + struct syscall_enter_args args; + struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc) +}; + // We need more tmp space than the BPF stack can give us struct augmented_args_tmp { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); @@ -182,15 +182,17 @@ int sys_enter_connect(struct syscall_enter_args *args) struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *sockaddr_arg = (const void *)args->args[1]; unsigned int socklen = args->args[2]; - unsigned int len = sizeof(augmented_args->args); + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs if (augmented_args == NULL) return 1; /* Failure: don't filter */ - _Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two"); - socklen &= sizeof(augmented_args->saddr) - 1; + _Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two"); + socklen &= sizeof(augmented_args->arg.saddr) - 1; - bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); + bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); + augmented_args->arg.size = socklen; + augmented_args->arg.err = 0; return augmented__output(args, augmented_args, len + socklen); } @@ -201,14 +203,14 @@ int sys_enter_sendto(struct 
syscall_enter_args *args) struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *sockaddr_arg = (const void *)args->args[4]; unsigned int socklen = args->args[5]; - unsigned int len = sizeof(augmented_args->args); + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs if (augmented_args == NULL) return 1; /* Failure: don't filter */ - socklen &= sizeof(augmented_args->saddr) - 1; + socklen &= sizeof(augmented_args->arg.saddr) - 1; - bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); + bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); return augmented__output(args, augmented_args, len + socklen); } @@ -249,13 +251,23 @@ int sys_enter_rename(struct syscall_enter_args *args) struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void *)args->args[0], *newpath_arg = (const void *)args->args[1]; - unsigned int len = sizeof(augmented_args->args), oldpath_len; + unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; if (augmented_args == NULL) return 1; /* Failure: don't filter */ + len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... + oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); - len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); + augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); + len += augmented_args->arg.size; + + struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; + + newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); + arg2->size = newpath_len; + + len += newpath_len; return augmented__output(args, augmented_args, len); } @@ -266,13 +278,23 @@ int sys_enter_renameat2(struct syscall_enter_args *args) struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void *)args->args[1], *newpath_arg = (const void *)args->args[3]; - unsigned int len = sizeof(augmented_args->args), oldpath_len; + unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; if (augmented_args == NULL) return 1; /* Failure: don't filter */ + len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 
+ oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); - len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); + augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); + len += augmented_args->arg.size; + + struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; + + newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); + arg2->size = newpath_len; + + len += newpath_len; return augmented__output(args, augmented_args, len); } @@ -293,26 +315,26 @@ int sys_enter_perf_event_open(struct syscall_enter_args *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; - unsigned int len = sizeof(augmented_args->args); + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs if (augmented_args == NULL) goto failure; - if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0) + if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0) goto failure; - attr_read = (const struct perf_event_attr_size *)augmented_args->__data; + attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value; __u32 size = attr_read->size; if (!size) size = PERF_ATTR_SIZE_VER0; - if (size > sizeof(augmented_args->__data)) + if (size > sizeof(augmented_args->arg.value)) goto failure; // Now that we read attr->size and tested it against the size limits, read it completely - if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0) + if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0) goto failure; return augmented__output(args, augmented_args, len + size); @@ -325,16 +347,16 @@ int sys_enter_clock_nanosleep(struct syscall_enter_args *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *rqtp_arg = (const void *)args->args[2]; - unsigned int len = sizeof(augmented_args->args); + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs __u32 size = sizeof(struct timespec64); if (augmented_args == NULL) goto failure; - if (size > sizeof(augmented_args->__data)) + if (size > sizeof(augmented_args->arg.value)) goto failure; - bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg); + bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg); return augmented__output(args, augmented_args, len + size); failure: @@ -352,10 +374,10 @@ int sys_enter_nanosleep(struct syscall_enter_args *args) if (augmented_args == NULL) goto failure; - if (size > sizeof(augmented_args->__data)) + if (size > sizeof(augmented_args->arg.value)) goto failure; - bpf_probe_read_user(&augmented_args->__data, size, req_arg); + bpf_probe_read_user(&augmented_args->arg.value, size, req_arg); return augmented__output(args, augmented_args, len + size); failure: -- cgit v1.2.3 From 2f2e439ba56f45b9d8aff300b39fa0bfa49ec2d8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Sep 2024 16:29:44 -0300 Subject: perf trace: Mark which syscall arguments go from user space to kernel space We need to know where to collect it in the BPF augmenters, if in the sys_enter hook or in the sys_exit hook. Start with the SCA_FILENAME one, that is just from user to kernel space. 
The alternative, better, but takes a bit more time than I have now, is to use the __user information that is already in the syscall args and encoded in BTF via a tag, do it later. Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 53690bbb143e..64f0aefeba3b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -102,6 +102,12 @@ /* * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100 + * + * We have to explicitely mark the direction of the flow of data, if from the + * kernel to user space or the other way around, since the BPF collector we + * have so far copies only from user to kernel space, mark the arguments that + * go that direction, so that we don´t end up collecting the previous contents + * for syscall args that goes from kernel to user space. */ struct syscall_arg_fmt { size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); @@ -110,6 +116,7 @@ struct syscall_arg_fmt { void *parm; const char *name; u16 nr_entries; // for arrays + bool from_user; bool show_zero; #ifdef HAVE_LIBBPF_SUPPORT const struct btf_type *type; @@ -851,6 +858,11 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, #define SCA_FILENAME syscall_arg__scnprintf_filename +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_FILENAME_FROM_USER(argname) \ + { .scnprintf = SCA_FILENAME, \ + .from_user = true, } + static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -1091,11 +1103,11 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, { .name = "faccessat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, { .name = "faccessat2", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, }, { .name = "fchmodat", @@ -1117,7 +1129,7 @@ static const struct syscall_fmt syscall_fmts[] = { [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, }, { .name = "fspick", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* path */ }, + [1] = SCA_FILENAME_FROM_USER(path), [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, }, { .name = "fstat", .alias = "newfstat", }, { .name = "futex", @@ -1181,20 +1193,20 @@ static const struct syscall_fmt syscall_fmts[] = { .parm = &strarray__mmap_flags, }, [5] = { .scnprintf = SCA_HEX, /* offset */ }, }, }, { .name = "mount", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ }, + .arg = { [0] = SCA_FILENAME_FROM_USER(devname), [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */ .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "move_mount", .arg = { [0] = { .scnprintf = SCA_FDAT, /* from_dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { 
.scnprintf = SCA_FDAT, /* to_dfd */ }, - [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ }, + [3] = SCA_FILENAME_FROM_USER(pathname), [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, }, { .name = "mq_unlink", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, }, { .name = "mremap", .hexret = true, .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, }, { .name = "name_to_handle_at", @@ -1203,7 +1215,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_TIMESPEC, /* req */ }, }, }, { .name = "newfstatat", .alias = "fstatat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "open", .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, @@ -1299,9 +1311,9 @@ static const struct syscall_fmt syscall_fmts[] = { [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } , [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, }, { .name = "swapoff", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, }, { .name = "swapon", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, }, { .name = "symlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, { .name = "sync_file_range", @@ -1311,11 +1323,11 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "tkill", .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "umount2", .alias = "umount", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "utimensat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, }, @@ -1903,9 +1915,10 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field if (strcmp(field->type, "const char *") == 0 && ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) || - strstr(field->name, "path") != NULL)) + strstr(field->name, "path") != NULL)) { arg->scnprintf = SCA_FILENAME; - else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr")) + arg->from_user = true; + } else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr")) arg->scnprintf = SCA_PTR; else if (strcmp(field->type, "pid_t") == 0) arg->scnprintf = SCA_PID; -- cgit v1.2.3 From 690eda6508c23023586ab81b11cae86476754d0b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Sep 2024 16:57:22 -0300 Subject: perf trace: Introduce SCA_PERF_ATTR_FROM_USER() to set .from_user = true Paving the way for the generic BPF BTF based syscall arg augmenter. 
Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 +- tools/perf/trace/beauty/perf_event_open.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 64f0aefeba3b..5f3146651684 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1226,7 +1226,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "perf_event_open", - .arg = { [0] = { .scnprintf = SCA_PERF_ATTR, /* attr */ }, + .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr), [2] = { .scnprintf = SCA_INT, /* cpu */ }, [3] = { .scnprintf = SCA_FD, /* group_fd */ }, [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, }, diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c index 632237128640..9f1ed989c775 100644 --- a/tools/perf/trace/beauty/perf_event_open.c +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -88,3 +88,7 @@ static size_t syscall_arg__scnprintf_perf_event_attr(char *bf, size_t size, stru } #define SCA_PERF_ATTR syscall_arg__scnprintf_perf_event_attr +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_PERF_ATTR_FROM_USER(argname) \ + { .scnprintf = SCA_PERF_ATTR, \ + .from_user = true, } -- cgit v1.2.3 From be14a71984e16b95ff725dbda19899868c5ec8a6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Sep 2024 16:57:22 -0300 Subject: perf trace: Introduce SCA_SOCKADDR_FROM_USER() to set .from_user = true Paving the way for the generic BPF BTF based syscall arg augmenter. 
Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 6 +++--- tools/perf/trace/beauty/beauty.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 5f3146651684..64c3f7f7a9aa 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1075,7 +1075,7 @@ static const struct syscall_fmt syscall_fmts[] = { [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, }, { .name = "bind", .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ }, - [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, + [1] = SCA_SOCKADDR_FROM_USER(umyaddr), [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, }, { .name = "bpf", .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, @@ -1095,7 +1095,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, { .name = "connect", .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ }, - [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, + [1] = SCA_SOCKADDR_FROM_USER(servaddr), [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, }, { .name = "epoll_ctl", .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, @@ -1289,7 +1289,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, { .name = "sendto", .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, - [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, }, + [4] = SCA_SOCKADDR_FROM_USER(addr), }, }, { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .arg = { [0] = STRARRAY(which, itimers), }, }, diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 3ed11e18ee2d..9c6bfb7d0ef1 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -229,6 +229,11 @@ size_t syscall_arg__scnprintf_renameat2_flags(char *bf, size_t size, struct sysc size_t syscall_arg__scnprintf_sockaddr(char *bf, size_t size, struct syscall_arg *arg); #define SCA_SOCKADDR syscall_arg__scnprintf_sockaddr +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_SOCKADDR_FROM_USER(argname) \ + { .scnprintf = SCA_SOCKADDR, \ + .from_user = true, } + size_t syscall_arg__scnprintf_socket_protocol(char *bf, size_t size, struct syscall_arg *arg); #define SCA_SK_PROTO syscall_arg__scnprintf_socket_protocol -- cgit v1.2.3 From c790f2bafb7a17d97c49c17607fa2ff919891f51 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Sep 2024 16:57:22 -0300 Subject: perf trace: Introduce SCA_TIMESPEC_FROM_USER() to set .from_user = true Paving the way for the generic BPF BTF based syscall arg augmenter. 
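All three *_FROM_USER() helpers introduced in this series are the same designated-initializer shorthand; spelled out for the nanosleep entry touched in the diff below (expansion shown for illustration only, taken from the macro as added to beauty.h):

/* the shorthand added to beauty.h */
#define SCA_TIMESPEC_FROM_USER(argname) \
	{ .scnprintf = SCA_TIMESPEC, \
	  .from_user = true, }

/* so this syscall_fmts[] entry ... */
{ .name = "nanosleep", .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, },

/* ... is equivalent to writing: */
{ .name = "nanosleep",
  .arg = { [0] = { .scnprintf = SCA_TIMESPEC, .from_user = true, }, }, },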
Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 4 ++-- tools/perf/trace/beauty/beauty.h | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 64c3f7f7a9aa..2bdbb6813512 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1084,7 +1084,7 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "clock_gettime", .arg = { [0] = STRARRAY(clk_id, clockid), }, }, { .name = "clock_nanosleep", - .arg = { [2] = { .scnprintf = SCA_TIMESPEC, /* rqtp */ }, }, }, + .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, }, { .name = "clone", .errpid = true, .nr_args = 5, .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, }, [1] = { .name = "child_stack", .scnprintf = SCA_HEX, }, @@ -1212,7 +1212,7 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "name_to_handle_at", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, { .name = "nanosleep", - .arg = { [0] = { .scnprintf = SCA_TIMESPEC, /* req */ }, }, }, + .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, }, { .name = "newfstatat", .alias = "fstatat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, [1] = SCA_FILENAME_FROM_USER(pathname), diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 9c6bfb7d0ef1..0a07ad158f87 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -255,6 +255,11 @@ size_t syscall_arg__scnprintf_sync_file_range_flags(char *bf, size_t size, struc size_t syscall_arg__scnprintf_timespec(char *bf, size_t size, struct syscall_arg *arg); #define SCA_TIMESPEC syscall_arg__scnprintf_timespec +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_TIMESPEC_FROM_USER(argname) \ + { .scnprintf = SCA_TIMESPEC, \ + .from_user = true, } + size_t open__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix); void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg, -- cgit v1.2.3 From 12707b9159e631af91c051fd102d469364b6ecc3 Mon Sep 17 00:00:00 2001 From: Shuyi Cheng Date: Sun, 8 Sep 2024 17:23:53 +0800 Subject: libbpf: Fixed getting wrong return address on arm64 architecture ARM64 has a separate lr register to store the return address, so here you only need to read the lr register to get the return address, no need to dereference it again. 
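In macro terms (the generic fallback is quoted from memory of bpf_tracing.h, so treat this as a sketch): on arm64, PT_REGS_RET() resolves to regs[30], i.e. the lr register, so its value already is the return address, and the fallback's extra probe-read dereferenced it as if it were a stack slot:

/* generic fallback: treat PT_REGS_RET() as an address and read through it */
#define BPF_KPROBE_READ_RET_IP(ip, ctx) \
	({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); })

/* arm64 (and sparc) after this fix: the register value is the return address */
#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); })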
Signed-off-by: Shuyi Cheng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1725787433-77262-1-git-send-email-chengshuyi@linux.alibaba.com --- tools/lib/bpf/bpf_tracing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 4eab132a963e..aa3b04f5542a 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -522,7 +522,7 @@ struct pt_regs; #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP -#elif defined(bpf_target_sparc) +#elif defined(bpf_target_sparc) || defined(bpf_target_arm64) #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP -- cgit v1.2.3 From 4cdc0e4ce5e893bc92255f5f734d983012f2bc2e Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 8 Sep 2024 22:00:09 +0800 Subject: bpftool: Fix undefined behavior caused by shifting into the sign bit Replace shifts of '1' with '1U' in bitwise operations within __show_dev_tc_bpf() to prevent undefined behavior caused by shifting into the sign bit of a signed integer. By using '1U', the operations are explicitly performed on unsigned integers, avoiding potential integer overflow or sign-related issues. Signed-off-by: Kuan-Wei Chiu Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240908140009.3149781-1-visitorckw@gmail.com --- tools/bpf/bpftool/net.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 2a51f1c25732..3c50fc66bb4a 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -486,9 +486,9 @@ static void __show_dev_tc_bpf(const struct ip_devname_ifindex *dev, if (prog_flags[i] || json_output) { NET_START_ARRAY("prog_flags", "%s "); for (j = 0; prog_flags[i] && j < 32; j++) { - if (!(prog_flags[i] & (1 << j))) + if (!(prog_flags[i] & (1U << j))) continue; - NET_DUMP_UINT_ONLY(1 << j); + NET_DUMP_UINT_ONLY(1U << j); } NET_END_ARRAY(""); } @@ -497,9 +497,9 @@ static void __show_dev_tc_bpf(const struct ip_devname_ifindex *dev, if (link_flags[i] || json_output) { NET_START_ARRAY("link_flags", "%s "); for (j = 0; link_flags[i] && j < 32; j++) { - if (!(link_flags[i] & (1 << j))) + if (!(link_flags[i] & (1U << j))) continue; - NET_DUMP_UINT_ONLY(1 << j); + NET_DUMP_UINT_ONLY(1U << j); } NET_END_ARRAY(""); } -- cgit v1.2.3 From f028d7716cdeae3be7d8d211482248916b25b1c2 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Mon, 9 Sep 2024 12:24:41 +0300 Subject: bpftool: Fix typos Fix typos in documentation. Reported-by: Matthew Wilcox Reported-by: Quentin Monnet Signed-off-by: Andrew Kreimer Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240909092452.4293-1-algonell@gmail.com --- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 1b426e58a7cd..ca860fd97d8d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -172,7 +172,7 @@ bpftool gen min_core_btf *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] CO-RE based application, turning the application portable to different kernel versions. 
- Check examples bellow for more information how to use it. + Check examples below for more information on how to use it. bpftool gen help Print short help message. -- cgit v1.2.3 From bee109b7b3e50739b88252a219fa07ecd78ad628 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 9 Sep 2024 16:39:09 +0300 Subject: bpf: Fix error message on kfunc arg type mismatch When "arg#%d expected pointer to ctx, but got %s" error is printed, both template parts actually point to the type of the argument, therefore, it will also say "but got PTR", regardless of what was the actual register type. Fix the message to print the register type in the second part of the template, change the existing test to adapt to the new format, and add a new test to test the case when arg is a pointer to context, but reg is a scalar. Fixes: 00b85860feb8 ("bpf: Rewrite kfunc argument handling") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240909133909.1315460-1-maxim@isovalent.com --- kernel/bpf/verifier.c | 3 ++- tools/testing/selftests/bpf/prog_tests/kfunc_call.c | 1 + tools/testing/selftests/bpf/progs/kfunc_call_fail.c | 7 +++++++ tools/testing/selftests/bpf/verifier/calls.c | 2 +- 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53d0556fbbf3..f35b80c16cda 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12132,7 +12132,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ switch (kf_arg_type) { case KF_ARG_PTR_TO_CTX: if (reg->type != PTR_TO_CTX) { - verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t)); + verbose(env, "arg#%d expected pointer to ctx, but got %s\n", + i, reg_type_str(env, reg->type)); return -EINVAL; } diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 5b743212292f..f79c8e53cb3e 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -68,6 +68,7 @@ static struct kfunc_test_params kfunc_tests[] = { TC_FAIL(kfunc_call_test_get_mem_fail_oob, 0, "min value is outside of the allowed memory range"), TC_FAIL(kfunc_call_test_get_mem_fail_not_const, 0, "is not a const"), TC_FAIL(kfunc_call_test_mem_acquire_fail, 0, "acquire kernel function does not return PTR_TO_BTF_ID"), + TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "arg#0 expected pointer to ctx, but got scalar"), /* success cases */ TC_TEST(kfunc_call_test1, 12), diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c index 4b0b7b79cdfb..08fae306539c 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c @@ -150,4 +150,11 @@ int kfunc_call_test_mem_acquire_fail(struct __sk_buff *skb) return ret; } +SEC("?tc") +int kfunc_call_test_pointer_arg_type_mismatch(struct __sk_buff *skb) +{ + bpf_kfunc_call_test_pass_ctx((void *)10); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index d0cdd156cd55..7afc2619ab14 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -76,7 +76,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 expected 
pointer to ctx, but got PTR", + .errstr = "arg#0 expected pointer to ctx, but got fp", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_pass_ctx", 2 }, }, -- cgit v1.2.3 From 41d0c4677feee1ea063e0f2c2af72dc953b1f1cc Mon Sep 17 00:00:00 2001 From: Yusheng Zheng Date: Mon, 9 Sep 2024 22:59:52 +0000 Subject: libbpf: Fix some typos in comments Fix some spelling errors in the code comments of libbpf: betwen -> between paremeters -> parameters knowning -> knowing definiton -> definition compatiblity -> compatibility overriden -> overridden occured -> occurred proccess -> process managment -> management nessary -> necessary Signed-off-by: Yusheng Zheng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240909225952.30324-1-yunwei356@gmail.com --- tools/lib/bpf/bpf_helpers.h | 2 +- tools/lib/bpf/bpf_tracing.h | 2 +- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/btf.h | 2 +- tools/lib/bpf/btf_dump.c | 2 +- tools/lib/bpf/libbpf.h | 10 +++++----- tools/lib/bpf/libbpf_legacy.h | 4 ++-- tools/lib/bpf/skel_internal.h | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 305c62817dd3..80bc0242e8dc 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -341,7 +341,7 @@ extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym; * I.e., it looks almost like high-level for each loop in other languages, * supports continue/break, and is verifiable by BPF verifier. * - * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) + * For iterating integers, the difference between bpf_for_each(num, i, N, M) * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int * *`, not just `int`. So for integers bpf_for() is more convenient. diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index aa3b04f5542a..a8f6cd4841b0 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -808,7 +808,7 @@ struct pt_regs; * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific * low-level way of getting kprobe input arguments from struct pt_regs, and * provides a familiar typed and named function arguments syntax and - * semantics of accessing kprobe input paremeters. + * semantics of accessing kprobe input parameters. * * Original struct pt_regs* context is preserved as 'ctx' argument. This might * be necessary when using BPF helpers like bpf_perf_event_output(). diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 8d51e73d55a8..3c131039c523 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4230,7 +4230,7 @@ static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id * consists of portions of the graph that come from multiple compilation units. * This is due to the fact that types within single compilation unit are always * deduplicated and FWDs are already resolved, if referenced struct/union - * definiton is available. So, if we had unresolved FWD and found corresponding + * definition is available. So, if we had unresolved FWD and found corresponding * STRUCT/UNION, they will be from different compilation units. 
This * consequently means that when we "link" FWD to corresponding STRUCT/UNION, * type graph will likely have at least two different BTF types that describe diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b68d216837a9..4e349ad79ee6 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -286,7 +286,7 @@ LIBBPF_API void btf_dump__free(struct btf_dump *d); LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); struct btf_dump_emit_type_decl_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* optional field name for type declaration, e.g.: * - struct my_struct diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 894860111ddb..0a7327541c17 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -304,7 +304,7 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id) * definition, in which case they have to be declared inline as part of field * type declaration; or as a top-level anonymous enum, typically used for * declaring global constants. It's impossible to distinguish between two - * without knowning whether given enum type was referenced from other type: + * without knowing whether given enum type was referenced from other type: * top-level anonymous enum won't be referenced by anything, while embedded * one will. */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 64a6a3d323e3..6917653ef9fa 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -152,7 +152,7 @@ struct bpf_object_open_opts { * log_buf and log_level settings. * * If specified, this log buffer will be passed for: - * - each BPF progral load (BPF_PROG_LOAD) attempt, unless overriden + * - each BPF progral load (BPF_PROG_LOAD) attempt, unless overridden * with bpf_program__set_log() on per-program level, to get * BPF verifier log output. * - during BPF object's BTF load into kernel (BPF_BTF_LOAD) to get @@ -455,7 +455,7 @@ LIBBPF_API int bpf_link__destroy(struct bpf_link *link); /** * @brief **bpf_program__attach()** is a generic function for attaching * a BPF program based on auto-detection of program type, attach type, - * and extra paremeters, where applicable. + * and extra parameters, where applicable. * * @param prog BPF program to attach * @return Reference to the newly created BPF link; or NULL is returned on error, @@ -679,7 +679,7 @@ struct bpf_uprobe_opts { /** * @brief **bpf_program__attach_uprobe()** attaches a BPF program * to the userspace function which is found by binary path and - * offset. You can optionally specify a particular proccess to attach + * offset. You can optionally specify a particular process to attach * to. You can also optionally attach the program to the function * exit instead of entry. * @@ -1593,11 +1593,11 @@ LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_i * memory region of the ring buffer. * This ring buffer can be used to implement a custom events consumer. * The ring buffer starts with the *struct perf_event_mmap_page*, which - * holds the ring buffer managment fields, when accessing the header + * holds the ring buffer management fields, when accessing the header * structure it's important to be SMP aware. * You can refer to *perf_event_read_simple* for a simple example. 
* @param pb the perf buffer structure - * @param buf_idx the buffer index to retreive + * @param buf_idx the buffer index to retrieve * @param buf (out) gets the base pointer of the mmap()'ed memory * @param buf_size (out) gets the size of the mmap()'ed region * @return 0 on success, negative error code for failure diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h index 1e1be467bede..60b2600be88a 100644 --- a/tools/lib/bpf/libbpf_legacy.h +++ b/tools/lib/bpf/libbpf_legacy.h @@ -76,7 +76,7 @@ enum libbpf_strict_mode { * first BPF program or map creation operation. This is done only if * kernel is too old to support memcg-based memory accounting for BPF * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY, - * but it can be overriden with libbpf_set_memlock_rlim() API. + * but it can be overridden with libbpf_set_memlock_rlim() API. * Note that libbpf_set_memlock_rlim() needs to be called before * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load() * operation. @@ -97,7 +97,7 @@ LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode); * @brief **libbpf_get_error()** extracts the error code from the passed * pointer * @param ptr pointer returned from libbpf API function - * @return error code; or 0 if no error occured + * @return error code; or 0 if no error occurred * * Note, as of libbpf 1.0 this function is not necessary and not recommended * to be used. Libbpf doesn't return error code embedded into the pointer diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 1e82ab06c3eb..0875452521e9 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -107,7 +107,7 @@ static inline void skel_free(const void *p) * The loader program will perform probe_read_kernel() from maps.rodata.initial_value. * skel_finalize_map_data() sets skel->rodata to point to actual value in a bpf map and * does maps.rodata.initial_value = ~0ULL to signal skel_free_map_data() that kvfree - * is not nessary. + * is not necessary. * * For user space: * skel_prep_map_data() mmaps anon memory into skel->rodata that can be accessed directly. -- cgit v1.2.3 From 391e86971161906e720f5d3c110ca9124c14fb55 Mon Sep 17 00:00:00 2001 From: Alexander Zhu Date: Fri, 30 Aug 2024 11:03:37 +0100 Subject: mm: selftest to verify zero-filled pages are mapped to zeropage When a THP is split, any subpage that is zero-filled will be mapped to the shared zeropage, hence saving memory. Add selftest to verify this by allocating zero-filled THP and comparing RssAnon before and after split. 
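The selftest below verifies the behavior indirectly through RssAnon accounting; as a side note, the zeropage mapping can also be observed directly through /proc/self/pagemap, as in this hypothetical helper (not part of the patch; the PFN field reads back as zero without CAP_SYS_ADMIN). Two zero-filled pages that were both remapped to the shared zeropage report the same PFN.

#include <stdint.h>
#include <unistd.h>

/* Hypothetical helper: look up the PFN backing 'addr' via a fd opened on
 * /proc/self/pagemap. Each pagemap entry is 8 bytes; bit 63 is the
 * present bit and bits 0-54 hold the PFN. */
static uint64_t pagemap_pfn(int pagemap_fd, void *addr, size_t page_size)
{
	uint64_t ent;
	off_t off = (off_t)((uintptr_t)addr / page_size) * sizeof(ent);

	if (pread(pagemap_fd, &ent, sizeof(ent), off) != (ssize_t)sizeof(ent))
		return 0;
	if (!(ent & (1ULL << 63)))	/* page not present */
		return 0;
	return ent & ((1ULL << 55) - 1);	/* PFN field */
}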
Link: https://lkml.kernel.org/r/20240830100438.3623486-4-usamaarif642@gmail.com Signed-off-by: Alexander Zhu Signed-off-by: Usama Arif Acked-by: Rik van Riel Cc: Barry Song Cc: David Hildenbrand Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kairui Song Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Shakeel Butt Cc: Shuang Zhai Cc: Yu Zhao Cc: Shuang Zhai Cc: Hugh Dickins Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/split_huge_page_test.c | 71 +++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 22 +++++++ tools/testing/selftests/mm/vm_util.h | 1 + 3 files changed, 94 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index e5e8dafc9d94..eb6d1b9fc362 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -84,6 +84,76 @@ static void write_debugfs(const char *fmt, ...) write_file(SPLIT_DEBUGFS, input, ret + 1); } +static char *allocate_zero_filled_hugepage(size_t len) +{ + char *result; + size_t i; + + result = memalign(pmd_pagesize, len); + if (!result) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(result, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + result[i] = (char)0; + + return result; +} + +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) +{ + unsigned long rss_anon_before, rss_anon_after; + size_t i; + + if (!check_huge_anon(one_page, 4, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + rss_anon_before = rss_anon(); + if (!rss_anon_before) { + printf("No RssAnon is allocated before split\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len, 0); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)0) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + if (!check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + rss_anon_after = rss_anon(); + if (rss_anon_after >= rss_anon_before) { + printf("Incorrect RssAnon value. 
Before: %ld After: %ld\n", + rss_anon_before, rss_anon_after); + exit(EXIT_FAILURE); + } +} + +void split_pmd_zero_pages(void) +{ + char *one_page; + int nr_hpages = 4; + size_t len = nr_hpages * pmd_pagesize; + + one_page = allocate_zero_filled_hugepage(len); + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); + printf("Split zero filled huge pages successful\n"); + free(one_page); +} + void split_pmd_thp(void) { char *one_page; @@ -431,6 +501,7 @@ int main(int argc, char **argv) fd_size = 2 * pmd_pagesize; + split_pmd_zero_pages(); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 5a62530da3b5..d8d0cf04bb57 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -12,6 +12,7 @@ #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" #define SMAP_FILE_PATH "/proc/self/smaps" +#define STATUS_FILE_PATH "/proc/self/status" #define MAX_LINE_LENGTH 500 unsigned int __page_size; @@ -171,6 +172,27 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } +unsigned long rss_anon(void) +{ + unsigned long rss_anon = 0; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + + fp = fopen(STATUS_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); + + if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer))) + goto err_out; + + if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1) + ksft_exit_fail_msg("Reading status error\n"); + +err_out: + fclose(fp); + return rss_anon; +} + bool __check_huge(void *addr, char *pattern, int nr_hpages, uint64_t hpage_size) { diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 9007c420d52c..2eaed8209925 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); +unsigned long rss_anon(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); -- cgit v1.2.3 From 536ab838a5b37b6ae3f8d53552560b7c51daeb41 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 30 Aug 2024 10:46:09 +0530 Subject: selftests/mm: relax test to fail after 100 migration failures It was recently observed at [1] that during the folio unmapping stage of migration, when the PTEs are cleared, a racing thread faulting on that folio may increase the refcount of the folio, sleep on the folio lock (the migration path has the lock), and migration ultimately fails when asserting the actual refcount against the expected. Thereby, the migration selftest fails on shared-anon mappings. The above enforces the fact that migration is a best-effort service, therefore, it is wrong to fail the test for just a single failure; hence, fail the test after 100 consecutive failures (where 100 is still a subjective choice). Note that, this has no effect on the execution time of the test since that is controlled by a timeout. 
[1] https://lore.kernel.org/all/20240801081657.1386743-1-dev.jain@arm.com/ Link: https://lkml.kernel.org/r/20240830051609.4037834-1-dev.jain@arm.com Signed-off-by: Dev Jain Suggested-by: David Hildenbrand Reviewed-by: Ryan Roberts Tested-by: Ryan Roberts Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dave Hansen Cc: Gavin Shan Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Mark Brown Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Oscar Salvador Cc: Shuah Khan Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/migration.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index 6908569ef406..64bcbb7151cf 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -15,10 +15,10 @@ #include #include -#define TWOMEG (2<<20) -#define RUNTIME (20) - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +#define TWOMEG (2<<20) +#define RUNTIME (20) +#define MAX_RETRIES 100 +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) FIXTURE(migration) { @@ -65,6 +65,7 @@ int migrate(uint64_t *ptr, int n1, int n2) int ret, tmp; int status = 0; struct timespec ts1, ts2; + int failures = 0; if (clock_gettime(CLOCK_MONOTONIC, &ts1)) return -1; @@ -79,13 +80,17 @@ int migrate(uint64_t *ptr, int n1, int n2) ret = move_pages(0, 1, (void **) &ptr, &n2, &status, MPOL_MF_MOVE_ALL); if (ret) { - if (ret > 0) + if (ret > 0) { + /* Migration is best effort; try again */ + if (++failures < MAX_RETRIES) + continue; printf("Didn't migrate %d pages\n", ret); + } else perror("Couldn't migrate pages"); return -2; } - + failures = 0; tmp = n2; n2 = n1; n1 = tmp; -- cgit v1.2.3 From a7e387375f22a4fd46c8ffcfa395a0ca8c186f9e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Fri, 6 Sep 2024 00:00:35 +0800 Subject: selftests: return failure when timestamps can't be reported When I was trying to modify the tx timestamping feature, I found that running "./txtimestamp -4 -C -L 127.0.0.1" didn't reflect the error: I succeeded to generate timestamp stored in the skb but later failed to report it to the userspace (which means failed to put css into cmsg). It can happen when someone writes buggy codes in __sock_recv_timestamp(), for example. After adding the check so that running ./txtimestamp will reflect the result correctly like this if there is a bug in the reporting phase: protocol: TCP payload: 10 server port: 9000 family: INET test SND USR: 1725458477 s 667997 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 718128 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 768273 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 818416 us (seq=0, len=0) Failed to report timestamps ... In the future, it will help us detect whether the new coming patch has bugs or not. 
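For reference, the reporting path being validated is the SCM_TIMESTAMPING control message on the socket error queue; here is a stripped-down sketch of the cmsg walk (count_tx_timestamps() is an illustrative name, simplified from what txtimestamp.c actually does):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/errqueue.h>

#ifndef SCM_TIMESTAMPING
#define SCM_TIMESTAMPING SO_TIMESTAMPING
#endif

/* Count the SCM_TIMESTAMPING records carried by an error-queue message;
 * zero means the kernel generated the event but failed to attach the
 * timestamp, which is the case the patch below now flags as a failure. */
static int count_tx_timestamps(struct msghdr *msg)
{
	struct scm_timestamping *tss;
	struct cmsghdr *cm;
	int batch = 0;

	for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) {
		if (cm->cmsg_level != SOL_SOCKET ||
		    cm->cmsg_type != SCM_TIMESTAMPING)
			continue;
		tss = (struct scm_timestamping *)CMSG_DATA(cm);
		printf("USR: %lld s %ld us\n", (long long)tss->ts[0].tv_sec,
		       tss->ts[0].tv_nsec / 1000);
		batch++;
	}
	return batch;
}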
Signed-off-by: Jason Xing Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240905160035.62407-1-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/txtimestamp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index ec60a16c9307..d626f22f9550 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -356,8 +356,12 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) } } - if (batch > 1) + if (batch > 1) { fprintf(stderr, "batched %d timestamps\n", batch); + } else if (!batch) { + fprintf(stderr, "Failed to report timestamps\n"); + test_failed = true; + } } static int recv_errmsg(int fd) -- cgit v1.2.3 From 4c30f5ce4f7af4f639af99e0bdeada8b268b7361 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Sep 2024 13:42:47 -1000 Subject: sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq() Once a task is put into a DSQ, the allowed operations are fairly limited. Tasks in the built-in local and global DSQs are executed automatically and, ignoring dequeue, there is only one way a task in a user DSQ can be manipulated - scx_bpf_consume() moves the first task to the dispatching local DSQ. This inflexibility sometimes gets in the way and is an area where multiple feature requests have been made. Implement scx_bpf_dispatch[_vtime]_from_dsq(), which can be called during DSQ iteration and can move the task to any DSQ - local DSQs, the global DSQ, and user DSQs. The kfuncs can be called from ops.dispatch() and any BPF context which doesn't hold a rq lock, including BPF timers and SYSCALL programs. This is an expansion of an earlier patch which only allowed moving into the dispatching local DSQ: http://lkml.kernel.org/r/Zn4Cw4FDTmvXnhaf@slm.duckdns.org v2: Remove @slice and @vtime from scx_bpf_dispatch[_vtime]_from_dsq() as they push scx_bpf_dispatch_vtime_from_dsq() over the kfunc argument count limit and often won't be needed anyway. Instead, provide scx_bpf_dispatch_from_dsq_set_{slice|vtime}() kfuncs which can be called only when needed and override the specified parameter for the subsequent dispatch.
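To make the new API concrete, a hedged BPF-C usage sketch (not from the patch): MY_DSQ is a hypothetical user DSQ created in ops.init(), the 5ms slice is arbitrary, and BPF_FOR_EACH_ITER is the convenience macro this series adds to common.bpf.h:

	/* assumes <scx/common.bpf.h> */
	#define MY_DSQ 0

	void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
	{
		struct task_struct *p;

		bpf_for_each(scx_dsq, p, MY_DSQ, 0) {
			/* optional: override the slice for the next transfer */
			scx_bpf_dispatch_from_dsq_set_slice(BPF_FOR_EACH_ITER,
							    5 * 1000 * 1000);

			/*
			 * Move @p to this CPU's local DSQ; a false return
			 * means someone else consumed or dequeued it first.
			 */
			if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
						      SCX_DSQ_LOCAL, 0))
				break;
		}
	}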
Signed-off-by: Tejun Heo Cc: Daniel Hodges Cc: David Vernet Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- kernel/sched/ext.c | 232 ++++++++++++++++++++++++++++++- tools/sched_ext/include/scx/common.bpf.h | 10 ++ 2 files changed, 239 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 44a2d1b9e837..b6e8c224f894 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1158,6 +1158,11 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, return true; } +static bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + /** * nldsq_next_task - Iterate to the next task in a non-local DSQ * @dsq: user dsq being iterated @@ -1211,13 +1216,20 @@ enum scx_dsq_iter_flags { /* iterate in the reverse dispatch order */ SCX_DSQ_ITER_REV = 1U << 16, + __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, + __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, - __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS, + __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | + __SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME, }; struct bpf_iter_scx_dsq_kern { struct scx_dsq_list_node cursor; struct scx_dispatch_q *dsq; + u64 slice; + u64 vtime; } __attribute__((aligned(8))); struct bpf_iter_scx_dsq { @@ -5872,7 +5884,7 @@ __bpf_kfunc_start_defs(); * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ * @p: task_struct to dispatch * @dsq_id: DSQ to dispatch to - * @slice: duration @p can run for in nsecs + * @slice: duration @p can run for in nsecs, 0 to keep the current value * @enq_flags: SCX_ENQ_* * * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe @@ -5922,7 +5934,7 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ * @p: task_struct to dispatch * @dsq_id: DSQ to dispatch to - * @slice: duration @p can run for in nsecs + * @slice: duration @p can run for in nsecs, 0 to keep the current value * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ * @enq_flags: SCX_ENQ_* * @@ -5963,6 +5975,118 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { .set = &scx_kfunc_ids_enqueue_dispatch, }; +static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; + struct rq *this_rq, *src_rq, *dst_rq, *locked_rq; + bool dispatched = false; + bool in_balance; + unsigned long flags; + + if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + /* + * Can be called from either ops.dispatch() locking this_rq() or any + * context where no rq lock is held. If the latter, lock @p's task_rq + * which we'll likely need anyway. + */ + src_rq = task_rq(p); + + local_irq_save(flags); + this_rq = this_rq(); + in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; + + if (in_balance) { + if (this_rq != src_rq) { + raw_spin_rq_unlock(this_rq); + raw_spin_rq_lock(src_rq); + } + } else { + raw_spin_rq_lock(src_rq); + } + + locked_rq = src_rq; + raw_spin_lock(&src_dsq->lock); + + /* + * Did someone else get to it? @p could have already left $src_dsq, got + * re-enqueued, or be in the process of being consumed by someone else.
+ */ + if (unlikely(p->scx.dsq != src_dsq || + u32_before(kit->cursor.priv, p->scx.dsq_seq) || + p->scx.holding_cpu >= 0) || + WARN_ON_ONCE(src_rq != task_rq(p))) { + raw_spin_unlock(&src_dsq->lock); + goto out; + } + + /* @p is still on $src_dsq and stable, determine the destination */ + dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); + + if (dst_dsq->id == SCX_DSQ_LOCAL) { + dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + dst_dsq = &scx_dsq_global; + dst_rq = src_rq; + } + } else { + /* no need to migrate if destination is a non-local DSQ */ + dst_rq = src_rq; + } + + /* + * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different + * CPU, @p will be migrated. + */ + if (dst_dsq->id == SCX_DSQ_LOCAL) { + /* @p is going from a non-local DSQ to a local DSQ */ + if (src_rq == dst_rq) { + task_unlink_from_dsq(p, src_dsq); + move_local_task_to_local_dsq(p, enq_flags, + src_dsq, dst_rq); + raw_spin_unlock(&src_dsq->lock); + } else { + raw_spin_unlock(&src_dsq->lock); + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + locked_rq = dst_rq; + } + } else { + /* + * @p is going from a non-local DSQ to a non-local DSQ. As + * $src_dsq is already locked, do an abbreviated dequeue. + */ + task_unlink_from_dsq(p, src_dsq); + p->scx.dsq = NULL; + raw_spin_unlock(&src_dsq->lock); + + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) + p->scx.dsq_vtime = kit->vtime; + dispatch_enqueue(dst_dsq, p, enq_flags); + } + + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) + p->scx.slice = kit->slice; + + dispatched = true; +out: + if (in_balance) { + if (this_rq != locked_rq) { + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(this_rq); + } + } else { + raw_spin_rq_unlock_irqrestore(locked_rq, flags); + } + + kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME); + return dispatched; +} + __bpf_kfunc_start_defs(); /** @@ -6042,12 +6166,112 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) } } +/** + * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ + * @it__iter: DSQ iterator in progress + * @slice: duration the dispatched task can run for in nsecs + * + * Override the slice of the next task that will be dispatched from @it__iter + * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called, + * the previous slice duration is kept. + */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( + struct bpf_iter_scx_dsq *it__iter, u64 slice) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; + + kit->slice = slice; + kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; +} + +/** + * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ + * @it__iter: DSQ iterator in progress + * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ + * + * Override the vtime of the next task that will be dispatched from @it__iter + * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the + * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to + * dispatch the next task, the override is ignored and cleared. 
+ */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( + struct bpf_iter_scx_dsq *it__iter, u64 vtime) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; + + kit->vtime = vtime; + kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; +} + +/** + * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ + * @it__iter: DSQ iterator in progress + * @p: task to transfer + * @dsq_id: DSQ to move @p to + * @enq_flags: SCX_ENQ_* + * + * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ + * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can + * be the destination. + * + * For the transfer to be successful, @p must still be on the DSQ and have been + * queued before the DSQ iteration started. This function doesn't care whether + * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have + * been queued before the iteration started. + * + * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to + * update. + * + * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq + * lock (e.g. BPF timers or SYSCALL programs). + * + * Returns %true if @p has been consumed, %false if @p had already been consumed + * or dequeued. + */ +__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags); +} + +/** + * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ + * @it__iter: DSQ iterator in progress + * @p: task to transfer + * @dsq_id: DSQ to move @p to + * @enq_flags: SCX_ENQ_* + * + * Transfer @p which is on the DSQ currently iterated by @it__iter to the + * priority queue of the DSQ specified by @dsq_id. The destination must be a + * user DSQ as only user DSQs support priority queue. + * + * @p's slice and vtime are kept by default. Use + * scx_bpf_dispatch_from_dsq_set_slice() and + * scx_bpf_dispatch_from_dsq_set_vtime() to update. + * + * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See + * scx_bpf_dispatch_vtime() for more information on @vtime. 
+ */ +__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { @@ -6144,6 +6368,8 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 457462b19966..f538c75db183 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -35,6 +35,10 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; +void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; +void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; @@ -63,6 +67,12 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; +/* + * Use the following as @it__iter when calling + * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops. + */ +#define BPF_FOR_EACH_ITER (&___it) + static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} -- cgit v1.2.3 From 2d285d561543d76b4a13263731b447e646c258f2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Sep 2024 13:42:47 -1000 Subject: scx_qmap: Implement highpri boosting Implement a silly boosting mechanism for nice -20 tasks. The only purpose is demonstrating and testing scx_bpf_dispatch_from_dsq(). The boosting only works within SHARED_DSQ and makes only minor differences with increased dispatch batch (-b). This exercises moving tasks to a user DSQ and all local DSQs from ops.dispatch() and BPF timerfn. v2: - Updated to use scx_bpf_dispatch_from_dsq_set_{slice|vtime}(). - Drop the workaround for the iterated tasks not being trusted by the verifier. The issue is fixed from BPF side. 
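A note on the HIGHPRI_WEIGHT constant in the diff below: 8668 presumably follows from the nice -20 load weight (88761 in the kernel's sched_prio_to_weight table) rescaled so that nice 0 (weight 1024) maps to 100, which appears to be how sched_ext normalizes p->scx.weight; a tiny sketch of that assumed arithmetic:

	#include <stdio.h>

	int main(void)
	{
		unsigned int nice_m20 = 88761;	/* sched_prio_to_weight[0] */

		/* 88761 * 100 / 1024 == 8668, matching HIGHPRI_WEIGHT */
		printf("%u\n", nice_m20 * 100 / 1024);
		return 0;
	}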
Signed-off-by: Tejun Heo Cc: Daniel Hodges Cc: David Vernet Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- tools/sched_ext/scx_qmap.bpf.c | 133 +++++++++++++++++++++++++++++++++++++---- tools/sched_ext/scx_qmap.c | 11 +++- 2 files changed, 130 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 7e7f28f4117e..83c8f54c1e31 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -27,6 +27,8 @@ enum consts { ONE_SEC_IN_NS = 1000000000, SHARED_DSQ = 0, + HIGHPRI_DSQ = 1, + HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ }; char _license[] SEC("license") = "GPL"; @@ -36,10 +38,12 @@ const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; +const volatile bool highpri_boosting; const volatile bool print_shared_dsq; const volatile s32 disallow_tgid; const volatile bool suppress_dump; +u64 nr_highpri_queued; u32 test_error_cnt; UEI_DEFINE(uei); @@ -95,6 +99,7 @@ static u64 core_sched_tail_seqs[5]; /* Per-task scheduling context */ struct task_ctx { bool force_local; /* Dispatch directly to local_dsq */ + bool highpri; u64 core_sched_seq; }; @@ -122,6 +127,7 @@ struct { /* Statistics */ u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; u64 nr_core_sched_execed; +u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; u32 cpuperf_min, cpuperf_avg, cpuperf_max; u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; @@ -140,17 +146,25 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) return -1; } +static struct task_ctx *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx *tctx; + + if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) { + scx_bpf_error("task_ctx lookup failed"); + return NULL; + } + return tctx; +} + s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { struct task_ctx *tctx; s32 cpu; - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); - if (!tctx) { - scx_bpf_error("task_ctx lookup failed"); + if (!(tctx = lookup_task_ctx(p))) return -ESRCH; - } cpu = pick_direct_dispatch_cpu(p, prev_cpu); @@ -197,11 +211,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) if (test_error_cnt && !--test_error_cnt) scx_bpf_error("test triggering error"); - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); - if (!tctx) { - scx_bpf_error("task_ctx lookup failed"); + if (!(tctx = lookup_task_ctx(p))) return; - } /* * All enqueued tasks must have their core_sched_seq updated for correct @@ -255,6 +266,10 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { + tctx->highpri = true; + __sync_fetch_and_add(&nr_highpri_queued, 1); + } __sync_fetch_and_add(&nr_enqueued, 1); } @@ -271,13 +286,80 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) static void update_core_sched_head_seq(struct task_struct *p) { - struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; - if (tctx) + if ((tctx = lookup_task_ctx(p))) core_sched_head_seqs[idx] = tctx->core_sched_seq; - else - scx_bpf_error("task_ctx lookup failed"); +} + +/* + * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly + * selective priority boosting mechanism by 
scanning SHARED_DSQ looking for + * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This + * makes minor difference only when dsp_batch is larger than 1. + * + * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and + * non-rq-lock holding BPF programs. As demonstration, this function is called + * from qmap_dispatch() and monitor_timerfn(). + */ +static bool dispatch_highpri(bool from_timer) +{ + struct task_struct *p; + s32 this_cpu = bpf_get_smp_processor_id(); + + /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ + bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { + static u64 highpri_seq; + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return false; + + if (tctx->highpri) { + /* exercise the set_*() and vtime interface too */ + scx_bpf_dispatch_from_dsq_set_slice( + BPF_FOR_EACH_ITER, slice_ns * 2); + scx_bpf_dispatch_from_dsq_set_vtime( + BPF_FOR_EACH_ITER, highpri_seq++); + scx_bpf_dispatch_vtime_from_dsq( + BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); + } + } + + /* + * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU + * is found. + */ + bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { + bool dispatched = false; + s32 cpu; + + if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) + cpu = this_cpu; + else + cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); + + if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, + SCX_DSQ_LOCAL_ON | cpu, + SCX_ENQ_PREEMPT)) { + if (cpu == this_cpu) { + dispatched = true; + __sync_fetch_and_add(&nr_expedited_local, 1); + } else { + __sync_fetch_and_add(&nr_expedited_remote, 1); + } + if (from_timer) + __sync_fetch_and_add(&nr_expedited_from_timer, 1); + } else { + __sync_fetch_and_add(&nr_expedited_lost, 1); + } + + if (dispatched) + return true; + } + + return false; } void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) @@ -289,7 +371,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) void *fifo; s32 i, pid; - if (scx_bpf_consume(SHARED_DSQ)) + if (dispatch_highpri(false)) + return; + + if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ)) return; if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { @@ -326,6 +411,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) /* Dispatch or advance. 
*/ bpf_repeat(BPF_MAX_LOOPS) { + struct task_ctx *tctx; + if (bpf_map_pop_elem(fifo, &pid)) break; @@ -333,13 +420,25 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (!p) continue; + if (!(tctx = lookup_task_ctx(p))) { + bpf_task_release(p); + return; + } + + if (tctx->highpri) + __sync_fetch_and_sub(&nr_highpri_queued, 1); + update_core_sched_head_seq(p); __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); bpf_task_release(p); + batch--; cpuc->dsp_cnt--; if (!batch || !scx_bpf_dispatch_nr_slots()) { + if (dispatch_highpri(false)) + return; scx_bpf_consume(SHARED_DSQ); return; } @@ -664,6 +763,10 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { + bpf_rcu_read_lock(); + dispatch_highpri(true); + bpf_rcu_read_unlock(); + monitor_cpuperf(); if (print_shared_dsq) @@ -685,6 +788,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) if (ret) return ret; + ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); + if (ret) + return ret; + timer = bpf_map_lookup_elem(&monitor_timer, &key); if (!timer) return -ESRCH; diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index c9ca30d62b2b..ac45a02b4055 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -29,6 +29,7 @@ const char help_fmt[] = " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" " -P Print out DSQ content to trace_pipe every second, use with -b\n" +" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" " -S Suppress qmap-specific debug dump\n" @@ -63,7 +64,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -86,6 +87,9 @@ int main(int argc, char **argv) case 'P': skel->rodata->print_shared_dsq = true; break; + case 'H': + skel->rodata->highpri_boosting = true; + break; case 'd': skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); if (skel->rodata->disallow_tgid < 0) @@ -121,6 +125,11 @@ int main(int argc, char **argv) skel->bss->nr_reenqueued, skel->bss->nr_dequeued, skel->bss->nr_core_sched_execed, skel->bss->nr_ddsp_from_enq); + printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n", + skel->bss->nr_expedited_local, + skel->bss->nr_expedited_remote, + skel->bss->nr_expedited_from_timer, + skel->bss->nr_expedited_lost); if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", skel->bss->cpuperf_min, -- cgit v1.2.3 From 0aa75a2b3fafccc875d260e190b14faf5a856d45 Mon Sep 17 00:00:00 2001 From: zhangjiao Date: Thu, 29 Aug 2024 12:20:08 +0800 Subject: tools/mm: rm thp_swap_allocator_test when make clean rm thp_swap_allocator_test when make clean Link: https://lkml.kernel.org/r/20240829042008.6937-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: zhangjiao Signed-off-by: Andrew Morton --- tools/mm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 15791c1c5b28..f5725b5c23aa 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -23,7 +23,7 @@ $(LIBS): $(CC) $(CFLAGS) -o 
$@ $< $(LDFLAGS) clean: - $(RM) page-types slabinfo page_owner_sort + $(RM) page-types slabinfo page_owner_sort thp_swap_allocator_test make -C $(LIB_DIR) clean sbindir ?= /usr/sbin -- cgit v1.2.3 From f58817c852e9c9eb8116c24d8271a35159636605 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:07 +0200 Subject: selftests: mptcp: lib: add time per subtests in TAP output It adds 'time=<N>ms' in the diagnostic data of the TAP output, e.g. ok 1 - pm_netlink: defaults addr list # time=9ms This addition is useful to quickly identify which subtests are taking a longer time than the others, or more than expected. Note that there are no specific formats to follow to show this time according to the TAP 13 [1], TAP 14 [2] and KTAP [3] specifications. Let's then define this one here. Link: https://testanything.org/tap-version-13-specification.html [1] Link: https://testanything.org/tap-version-14-specification.html [2] Link: https://docs.kernel.org/dev-tools/ktap.html [3] Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-1-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 4578a331041e..975d4d4c862a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -29,6 +29,7 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 MPTCP_LIB_SUBTEST_FLAKY=0 +MPTCP_LIB_SUBTESTS_LAST_TS_MS= MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" MPTCP_LIB_IP_MPTCP=0 @@ -205,6 +206,11 @@ mptcp_lib_kversion_ge() { mptcp_lib_fail_if_expected_feature "kernel version ${1} lower than ${v}" } +mptcp_lib_subtests_last_ts_reset() { + MPTCP_LIB_SUBTESTS_LAST_TS_MS="$(date +%s%3N)" +} +mptcp_lib_subtests_last_ts_reset + __mptcp_lib_result_check_duplicated() { local subtest @@ -219,13 +225,22 @@ __mptcp_lib_result_check_duplicated() { __mptcp_lib_result_add() { local result="${1}" + local time="time=" + local ts_prev_ms shift local id=$((${#MPTCP_LIB_SUBTESTS[@]} + 1)) __mptcp_lib_result_check_duplicated "${*}" - MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*}") + # not to add two '#' + [[ "${*}" != *"#"* ]] && time="# ${time}" + + ts_prev_ms="${MPTCP_LIB_SUBTESTS_LAST_TS_MS}" + mptcp_lib_subtests_last_ts_reset + time+="$((MPTCP_LIB_SUBTESTS_LAST_TS_MS - ts_prev_ms))ms" + + MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*} ${time}") } # $1: test name -- cgit v1.2.3 From 1a38cee4bbd0adeb62e11aab429678b1fb4a12ae Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:08 +0200 Subject: selftests: mptcp: connect: remove time in TAP output It is now added by the MPTCP lib automatically, see the parent commit. The time in the TAP output might be slightly different from the one displayed before, but that's OK.
Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-2-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index b77fb7065bfb..f61e2f5870ea 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -431,7 +431,6 @@ do_transfer() local duration duration=$((stop-start)) - result_msg+=" # time=${duration}ms" printf "(duration %05sms) " "${duration}" if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then mptcp_lib_pr_fail "client exit code $retc, server $rets" -- cgit v1.2.3 From d4e192728efc532bc9a93e664f81fac602786450 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:09 +0200 Subject: selftests: mptcp: reset the last TS before the first test Just to slightly improve the precision of the duration of the first test. In mptcp_join.sh, the last append_prev_results is now done as soon as the last test is over: this will add the last result in the list, and get a more precise time for this last test. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-3-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 ++ tools/testing/selftests/net/mptcp/mptcp_join.sh | 3 ++- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 1 + tools/testing/selftests/net/mptcp/pm_netlink.sh | 2 ++ tools/testing/selftests/net/mptcp/simult_flows.sh | 1 + tools/testing/selftests/net/mptcp/userspace_pm.sh | 1 + 6 files changed, 9 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index f61e2f5870ea..49d90c4dbc01 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -847,6 +847,8 @@ stop_if_error() make_file "$cin" "client" make_file "$sin" "server" +mptcp_lib_subtests_last_ts_reset + check_mptcp_disabled stop_if_error "The kernel configuration is not valid for MPTCP" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 43f8a9bd84c4..3564cd06643c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3959,9 +3959,11 @@ if [ ${#tests[@]} -eq 0 ]; then tests=("${all_tests_names[@]}") fi +mptcp_lib_subtests_last_ts_reset for subtests in "${tests[@]}"; do "${subtests}" done +append_prev_results if [ ${ret} -ne 0 ]; then echo @@ -3972,7 +3974,6 @@ if [ ${ret} -ne 0 ]; then echo fi -append_prev_results mptcp_lib_result_print_all_tap exit $ret diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 68899a303a1a..5e8d5b83e2d0 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -349,6 +349,7 @@ init make_file "$cin" "client" 1 make_file "$sin" "server" 1 trap cleanup EXIT +mptcp_lib_subtests_last_ts_reset run_tests $ns1 $ns2 10.0.1.1 run_tests $ns1 $ns2 dead:beef:1::1 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh 
b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 2757378b1b13..2e6648a2b2c0 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -137,6 +137,8 @@ check() fi } +mptcp_lib_subtests_last_ts_reset + check "show_endpoints" "" "defaults addr list" default_limits="$(get_limits)" diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index f74e1c3c126d..8fa77c8e9b65 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -286,6 +286,7 @@ while getopts "bcdhi" option;do done setup +mptcp_lib_subtests_last_ts_reset run_test 10 10 0 0 "balanced bwidth" run_test 10 10 1 25 "balanced bwidth with unbalanced delay" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 9cb05978269d..3651f73451cf 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -150,6 +150,7 @@ mptcp_lib_events "${ns2}" "${client_evts}" client_evts_pid server_evts=$(mktemp) mptcp_lib_events "${ns1}" "${server_evts}" server_evts_pid sleep 0.5 +mptcp_lib_subtests_last_ts_reset print_title "Init" print_test "Created network namespaces ns1, ns2" -- cgit v1.2.3 From a5b6be42aac0fd3b94adaeec3cda4d847b304fd8 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:10 +0200 Subject: selftests: mptcp: diag: remove trailing whitespace It doesn't need to be there, and it can cause some issues with TAP parsers expecting only one space around the directive delimiter (#). Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-4-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/diag.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 776d43a6922d..2bd0c1eb70c5 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -284,7 +284,7 @@ echo "b" | \ ./mptcp_connect -p 10000 -r 0 -t ${timeout_poll} -w 20 \ 127.0.0.1 >/dev/null & wait_connected $ns 10000 -chk_msk_nr 2 "after MPC handshake " +chk_msk_nr 2 "after MPC handshake" chk_last_time_info 10000 chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" -- cgit v1.2.3 From a92d1db0c989d4244d3a671d78f291606ed4ce35 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:11 +0200 Subject: selftests: mptcp: connect: remove duplicated spaces in TAP output It is nice to have a visual alignment in the test output to present the different results, but it makes less sense in the TAP output that is there for computers. It sounds then better to remove the duplicated whitespaces in the TAP output, also because it can cause some issues with TAP parsers expecting only one space around the directive delimiter (#). While at it, change the variable name (result_msg) to something more explicit. 
Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-5-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 49d90c4dbc01..57325d57e4c6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -345,9 +345,11 @@ do_transfer() local addr_port addr_port=$(printf "%s:%d" ${connect_addr} ${port}) - local result_msg - result_msg="$(printf "%.3s %-5s -> %.3s (%-20s) %-5s" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto})" - mptcp_lib_print_title "${result_msg}" + local pretty_title + pretty_title="$(printf "%.3s %-5s -> %.3s (%-20s) %-5s" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto})" + mptcp_lib_print_title "${pretty_title}" + + local tap_title="${connector_ns:0:3} ${cl_proto} -> ${listener_ns:0:3} (${addr_port}) ${srv_proto}" if $capture; then local capuser @@ -443,7 +445,7 @@ do_transfer() echo cat "$capout" - mptcp_lib_result_fail "${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_fail "${TEST_GROUP}: ${tap_title}" return 1 fi @@ -543,12 +545,12 @@ do_transfer() if [ $retc -eq 0 ] && [ $rets -eq 0 ]; then mptcp_lib_pr_ok "${extra:1}" - mptcp_lib_result_pass "${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_pass "${TEST_GROUP}: ${tap_title}" else if [ -n "${extra}" ]; then mptcp_lib_print_warn "${extra:1}" fi - mptcp_lib_result_fail "${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_fail "${TEST_GROUP}: ${tap_title}" fi cat "$capout" -- cgit v1.2.3 From 5aa57d9f2d5311f19434d95b2a81610aa263e23b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 5 Sep 2024 12:32:40 -0700 Subject: af_unix: Don't return OOB skb in manage_oob(). syzbot reported use-after-free in unix_stream_recv_urg(). [0] The scenario is 1. send(MSG_OOB) 2. recv(MSG_OOB) -> The consumed OOB remains in recv queue 3. send(MSG_OOB) 4. recv() -> manage_oob() returns the next skb of the consumed OOB -> This is also OOB, but unix_sk(sk)->oob_skb is not cleared 5. recv(MSG_OOB) -> unix_sk(sk)->oob_skb is used but already freed The recent commit 8594d9b85c07 ("af_unix: Don't call skb_get() for OOB skb.") uncovered the issue. If the OOB skb is consumed and the next skb is peeked in manage_oob(), we still need to check if the skb is OOB. Let's do so by falling back to the following checks in manage_oob() and add the test case in selftest. Note that we need to add a similar check for SIOCATMARK. 
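As an illustration only, the five-step scenario above maps onto a minimal userspace sketch (mirroring the new ex_oob_oob selftest added below; error handling omitted, and the described behavior is that of a kernel without this fix):

	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		int sk[2];
		char c;

		socketpair(AF_UNIX, SOCK_STREAM, 0, sk);

		send(sk[0], "x", 1, MSG_OOB);		/* 1. send(MSG_OOB) */
		recv(sk[1], &c, 1, MSG_OOB);		/* 2. consumed OOB stays queued */
		send(sk[0], "y", 1, MSG_OOB);		/* 3. send(MSG_OOB) again */
		recv(sk[1], &c, 1, MSG_DONTWAIT);	/* 4. manage_oob() returns next skb */
		recv(sk[1], &c, 1, MSG_OOB);		/* 5. use-after-free before the fix */

		close(sk[0]);
		close(sk[1]);
		return 0;
	}

With the fix applied, step 4 returns -EAGAIN and step 5 returns -EINVAL, as the new test asserts.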
[0]: BUG: KASAN: slab-use-after-free in unix_stream_read_actor+0xa6/0xb0 net/unix/af_unix.c:2959 Read of size 4 at addr ffff8880326abcc4 by task syz-executor178/5235 CPU: 0 UID: 0 PID: 5235 Comm: syz-executor178 Not tainted 6.11.0-rc5-syzkaller-00742-gfbdaffe41adc #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 unix_stream_read_actor+0xa6/0xb0 net/unix/af_unix.c:2959 unix_stream_recv_urg+0x1df/0x320 net/unix/af_unix.c:2640 unix_stream_read_generic+0x2456/0x2520 net/unix/af_unix.c:2778 unix_stream_recvmsg+0x22b/0x2c0 net/unix/af_unix.c:2996 sock_recvmsg_nosec net/socket.c:1046 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1068 ____sys_recvmsg+0x1db/0x470 net/socket.c:2816 ___sys_recvmsg net/socket.c:2858 [inline] __sys_recvmsg+0x2f0/0x3e0 net/socket.c:2888 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5360d6b4e9 Code: 48 83 c4 28 c3 e8 37 17 00 00 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff29b3a458 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 00007fff29b3a638 RCX: 00007f5360d6b4e9 RDX: 0000000000002001 RSI: 0000000020000640 RDI: 0000000000000003 RBP: 00007f5360dde610 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 R13: 00007fff29b3a628 R14: 0000000000000001 R15: 0000000000000001 Allocated by task 5235: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:312 [inline] __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338 kasan_slab_alloc include/linux/kasan.h:201 [inline] slab_post_alloc_hook mm/slub.c:3988 [inline] slab_alloc_node mm/slub.c:4037 [inline] kmem_cache_alloc_node_noprof+0x16b/0x320 mm/slub.c:4080 __alloc_skb+0x1c3/0x440 net/core/skbuff.c:667 alloc_skb include/linux/skbuff.h:1320 [inline] alloc_skb_with_frags+0xc3/0x770 net/core/skbuff.c:6528 sock_alloc_send_pskb+0x91a/0xa60 net/core/sock.c:2815 sock_alloc_send_skb include/net/sock.h:1778 [inline] queue_oob+0x108/0x680 net/unix/af_unix.c:2198 unix_stream_sendmsg+0xd24/0xf80 net/unix/af_unix.c:2351 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597 ___sys_sendmsg net/socket.c:2651 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2680 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 5235: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579 poison_slab_object+0xe0/0x150 mm/kasan/common.c:240 __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256 kasan_slab_free include/linux/kasan.h:184 [inline] slab_free_hook mm/slub.c:2252 [inline] slab_free mm/slub.c:4473 [inline] kmem_cache_free+0x145/0x350 mm/slub.c:4548 unix_stream_read_generic+0x1ef6/0x2520 net/unix/af_unix.c:2917 unix_stream_recvmsg+0x22b/0x2c0 net/unix/af_unix.c:2996 sock_recvmsg_nosec 
net/socket.c:1046 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1068 __sys_recvfrom+0x256/0x3e0 net/socket.c:2255 __do_sys_recvfrom net/socket.c:2273 [inline] __se_sys_recvfrom net/socket.c:2269 [inline] __x64_sys_recvfrom+0xde/0x100 net/socket.c:2269 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff8880326abc80 which belongs to the cache skbuff_head_cache of size 240 The buggy address is located 68 bytes inside of freed 240-byte region [ffff8880326abc80, ffff8880326abd70) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x326ab ksm flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) page_type: 0xfdffffff(slab) raw: 00fff00000000000 ffff88801eaee780 ffffea0000b7dc80 dead000000000003 raw: 0000000000000000 00000000800c000c 00000001fdffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x52cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP), pid 4686, tgid 4686 (udevadm), ts 32357469485, free_ts 28829011109 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1493 prep_new_page mm/page_alloc.c:1501 [inline] get_page_from_freelist+0x2e4c/0x2f10 mm/page_alloc.c:3439 __alloc_pages_noprof+0x256/0x6c0 mm/page_alloc.c:4695 __alloc_pages_node_noprof include/linux/gfp.h:269 [inline] alloc_pages_node_noprof include/linux/gfp.h:296 [inline] alloc_slab_page+0x5f/0x120 mm/slub.c:2321 allocate_slab+0x5a/0x2f0 mm/slub.c:2484 new_slab mm/slub.c:2537 [inline] ___slab_alloc+0xcd1/0x14b0 mm/slub.c:3723 __slab_alloc+0x58/0xa0 mm/slub.c:3813 __slab_alloc_node mm/slub.c:3866 [inline] slab_alloc_node mm/slub.c:4025 [inline] kmem_cache_alloc_node_noprof+0x1fe/0x320 mm/slub.c:4080 __alloc_skb+0x1c3/0x440 net/core/skbuff.c:667 alloc_skb include/linux/skbuff.h:1320 [inline] alloc_uevent_skb+0x74/0x230 lib/kobject_uevent.c:289 uevent_net_broadcast_untagged lib/kobject_uevent.c:326 [inline] kobject_uevent_net_broadcast+0x2fd/0x580 lib/kobject_uevent.c:410 kobject_uevent_env+0x57d/0x8e0 lib/kobject_uevent.c:608 kobject_synth_uevent+0x4ef/0xae0 lib/kobject_uevent.c:207 uevent_store+0x4b/0x70 drivers/base/bus.c:633 kernfs_fop_write_iter+0x3a1/0x500 fs/kernfs/file.c:334 new_sync_write fs/read_write.c:497 [inline] vfs_write+0xa72/0xc90 fs/read_write.c:590 page last free pid 1 tgid 1 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1094 [inline] free_unref_page+0xd22/0xea0 mm/page_alloc.c:2612 kasan_depopulate_vmalloc_pte+0x74/0x90 mm/kasan/shadow.c:408 apply_to_pte_range mm/memory.c:2797 [inline] apply_to_pmd_range mm/memory.c:2841 [inline] apply_to_pud_range mm/memory.c:2877 [inline] apply_to_p4d_range mm/memory.c:2913 [inline] __apply_to_page_range+0x8a8/0xe50 mm/memory.c:2947 kasan_release_vmalloc+0x9a/0xb0 mm/kasan/shadow.c:525 purge_vmap_node+0x3e3/0x770 mm/vmalloc.c:2208 __purge_vmap_area_lazy+0x708/0xae0 mm/vmalloc.c:2290 _vm_unmap_aliases+0x79d/0x840 mm/vmalloc.c:2885 change_page_attr_set_clr+0x2fe/0xdb0 arch/x86/mm/pat/set_memory.c:1881 change_page_attr_set arch/x86/mm/pat/set_memory.c:1922 [inline] set_memory_nx+0xf2/0x130 arch/x86/mm/pat/set_memory.c:2110 free_init_pages arch/x86/mm/init.c:924 [inline] free_kernel_image_pages arch/x86/mm/init.c:943 [inline] free_initmem+0x79/0x110 
arch/x86/mm/init.c:970 kernel_init+0x31/0x2b0 init/main.c:1476 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Memory state around the buggy address: ffff8880326abb80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880326abc00: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc >ffff8880326abc80: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880326abd00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc ffff8880326abd80: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb Fixes: 93c99f21db36 ("af_unix: Don't stop recv(MSG_DONTWAIT) if consumed OOB skb is at the head.") Reported-by: syzbot+8811381d455e3e9ec788@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8811381d455e3e9ec788 Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240905193240.17565-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 9 +++++++-- tools/testing/selftests/net/af_unix/msg_oob.c | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 159d78fc3d14..001ccc55ef0f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2673,7 +2673,8 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, __skb_unlink(read_skb, &sk->sk_receive_queue); } - goto unlock; + if (!skb) + goto unlock; } if (skb != u->oob_skb) @@ -3175,9 +3176,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) skb = skb_peek(&sk->sk_receive_queue); if (skb) { struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); + struct sk_buff *next_skb; + + next_skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb == oob_skb || - (!oob_skb && !unix_skb_len(skb))) + (!unix_skb_len(skb) && + (!oob_skb || next_skb == oob_skb))) answ = 1; } diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 535eb2c3d7d1..3ed3882a93b8 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -525,6 +525,29 @@ TEST_F(msg_oob, ex_oob_drop_2) } } +TEST_F(msg_oob, ex_oob_oob) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("", -EAGAIN, 1, 0); + epollpair(false); + siocatmarkpair(false); + + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); +} + TEST_F(msg_oob, ex_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); -- cgit v1.2.3 From dbd61921a6adbd231d8ca91537f1e7b62e9fea61 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 5 Sep 2024 19:15:51 -0400 Subject: selftests: support interpreted scripts with ksft_runner.sh Support testcases that are themselves not executable, but need an interpreter to run them. If a test file is not executable, but an executable file ksft_runner.sh exists in the TARGET dir, kselftest will run ./ksft_runner.sh ./$BASENAME_TEST Packetdrill may add hundreds of packetdrill scripts for testing. These scripts must be passed to the packetdrill process. Have kselftest run each test directly, as it already solves common runner requirements like parallel execution and isolation (netns). A previous RFC added a wrapper in between, which would have to reimplement such functionality. 
Link: https://lore.kernel.org/netdev/66d4d97a4cac_3df182941a@willemb.c.googlers.com.notmuch/T/ Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20240905231653.2427327-2-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest/runner.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index 74954f6a8f94..2c3c58e65a41 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -111,8 +111,11 @@ run_one() stdbuf="/usr/bin/stdbuf --output=L " fi eval kselftest_cmd_args="\$${kselftest_cmd_args_ref:-}" - cmd="$stdbuf ./$BASENAME_TEST $kselftest_cmd_args" - if [ ! -x "$TEST" ]; then + if [ -x "$TEST" ]; then + cmd="$stdbuf ./$BASENAME_TEST $kselftest_cmd_args" + elif [ -x "./ksft_runner.sh" ]; then + cmd="$stdbuf ./ksft_runner.sh ./$BASENAME_TEST" + else echo "# Warning: file $TEST is not executable" if [ $(head -n 1 "$TEST" | cut -c -2) = "#!" ] -- cgit v1.2.3 From 8a405552fd3b1eefe186e724343e88790f6be832 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 5 Sep 2024 19:15:52 -0400 Subject: selftests/net: integrate packetdrill with ksft Lay the groundwork to import into kselftests the over 150 packetdrill TCP/IP conformance tests on github.com/google/packetdrill. Florian recently added support for packetdrill tests in nf_conntrack, in commit a8a388c2aae49 ("selftests: netfilter: add packetdrill based conntrack tests"). This patch takes a slightly different approach. It relies on ksft_runner.sh to run every *.pkt file in the directory. Any future imports of packetdrill tests should require no additional coding. Just add the *.pkt files. Initially import only two features/directories from github. One with a single script, and one with two. This was the only reason to pick tcp/inq and tcp/md5. The path replaces the directory hierarchy in github with a flat space of files: $(subst /,_,$(wildcard tcp/**/*.pkt)). This is the most straightforward option to integrate with kselftests. The Linked thread reviewed two ways to maintain the hierarchy: TEST_PROGS_RECURSE and PRESERVE_TEST_DIRS. But both introduce significant changes to kselftest infra and with that risk to existing tests. 
Implementation notes: - restore alphabetical order when adding the new directory to tools/testing/selftests/Makefile - imported *.pkt files and support verbatim from the github project, except for - update `source ./defaults.sh` path (to adjust for flat dir) - add SPDX headers - remove one author statement - Acknowledgment: drop an e (checkpatch) Tested: make -C tools/testing/selftests \ TARGETS=net/packetdrill \ run_tests make -C tools/testing/selftests \ TARGETS=net/packetdrill \ install INSTALL_PATH=$KSFT_INSTALL_PATH # in virtme-ng ./run_kselftest.sh -c net/packetdrill ./run_kselftest.sh -t net/packetdrill:tcp_inq_client.pkt Link: https://lore.kernel.org/netdev/20240827193417.2792223-1-willemdebruijn.kernel@gmail.com/ Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20240905231653.2427327-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/Makefile | 5 +- tools/testing/selftests/net/packetdrill/Makefile | 9 ++++ tools/testing/selftests/net/packetdrill/config | 5 ++ .../testing/selftests/net/packetdrill/defaults.sh | 63 ++++++++++++++++++++++ .../selftests/net/packetdrill/ksft_runner.sh | 41 ++++++++++++++ .../selftests/net/packetdrill/tcp_inq_client.pkt | 51 ++++++++++++++++++ .../selftests/net/packetdrill/tcp_inq_server.pkt | 51 ++++++++++++++++++ .../packetdrill/tcp_md5_md5-only-on-client-ack.pkt | 28 ++++++++++ 8 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/net/packetdrill/Makefile create mode 100644 tools/testing/selftests/net/packetdrill/config create mode 100755 tools/testing/selftests/net/packetdrill/defaults.sh create mode 100755 tools/testing/selftests/net/packetdrill/ksft_runner.sh create mode 100644 tools/testing/selftests/net/packetdrill/tcp_inq_client.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_inq_server.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_md5_md5-only-on-client-ack.pkt (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index a5f1c0c27dff..3b7df5477317 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -65,10 +65,11 @@ TARGETS += net/af_unix TARGETS += net/forwarding TARGETS += net/hsr TARGETS += net/mptcp -TARGETS += net/openvswitch -TARGETS += net/tcp_ao TARGETS += net/netfilter +TARGETS += net/openvswitch +TARGETS += net/packetdrill TARGETS += net/rds +TARGETS += net/tcp_ao TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd diff --git a/tools/testing/selftests/net/packetdrill/Makefile b/tools/testing/selftests/net/packetdrill/Makefile new file mode 100644 index 000000000000..870f7258dc8d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := ksft_runner.sh \ + defaults.sh \ + ../../kselftest/ktap_helpers.sh + +TEST_PROGS := $(wildcard *.pkt) + +include ../../lib.mk diff --git a/tools/testing/selftests/net/packetdrill/config b/tools/testing/selftests/net/packetdrill/config new file mode 100644 index 000000000000..0d402830f18d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/config @@ -0,0 +1,5 @@ +CONFIG_IPV6=y +CONFIG_NET_SCH_FIFO=y +CONFIG_PROC_SYSCTL=y +CONFIG_TCP_MD5SIG=y +CONFIG_TUN=y diff --git a/tools/testing/selftests/net/packetdrill/defaults.sh b/tools/testing/selftests/net/packetdrill/defaults.sh new file mode 100755 index 000000000000..1095a7b22f44 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/defaults.sh 
@@ -0,0 +1,63 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Set standard production config values that relate to TCP behavior. + +# Flush old cached data (fastopen cookies). +ip tcp_metrics flush all > /dev/null 2>&1 + +# TCP min, default, and max receive and send buffer sizes. +sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))" +sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304" + +# TCP timestamps. +sysctl -q net.ipv4.tcp_timestamps=1 + +# TCP SYN(ACK) retry thresholds +sysctl -q net.ipv4.tcp_syn_retries=5 +sysctl -q net.ipv4.tcp_synack_retries=5 + +# TCP Forward RTO-Recovery, RFC 5682. +sysctl -q net.ipv4.tcp_frto=2 + +# TCP Selective Acknowledgements (SACK) +sysctl -q net.ipv4.tcp_sack=1 + +# TCP Duplicate Selective Acknowledgements (DSACK) +sysctl -q net.ipv4.tcp_dsack=1 + +# TCP FACK (Forward Acknowledgment) +sysctl -q net.ipv4.tcp_fack=0 + +# TCP reordering degree ("dupthresh" threshold for entering Fast Recovery). +sysctl -q net.ipv4.tcp_reordering=3 + +# TCP congestion control. +sysctl -q net.ipv4.tcp_congestion_control=cubic + +# TCP slow start after idle. +sysctl -q net.ipv4.tcp_slow_start_after_idle=0 + +# TCP RACK and TLP. +sysctl -q net.ipv4.tcp_early_retrans=4 net.ipv4.tcp_recovery=1 + +# TCP method for deciding when to defer sending to accumulate big TSO packets. +sysctl -q net.ipv4.tcp_tso_win_divisor=3 + +# TCP Explicit Congestion Notification (ECN) +sysctl -q net.ipv4.tcp_ecn=0 + +sysctl -q net.ipv4.tcp_pacing_ss_ratio=200 +sysctl -q net.ipv4.tcp_pacing_ca_ratio=120 +sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1 + +sysctl -q net.ipv4.tcp_fastopen=0x70403 +sysctl -q net.ipv4.tcp_fastopen_key=a1a1a1a1-b2b2b2b2-c3c3c3c3-d4d4d4d4 + +sysctl -q net.ipv4.tcp_syncookies=1 + +# Override the default qdisc on the tun device. +# Many tests fail with timing errors if the default +# is FQ and that paces their flows. +tc qdisc add dev tun0 root pfifo + diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh new file mode 100755 index 000000000000..2f62caccbbbc --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source "$(dirname $(realpath $0))/../../kselftest/ktap_helpers.sh" + +readonly ipv4_args=('--ip_version=ipv4 ' + '--local_ip=192.168.0.1 ' + '--gateway_ip=192.168.0.1 ' + '--netmask_ip=255.255.0.0 ' + '--remote_ip=192.0.2.1 ' + '-D CMSG_LEVEL_IP=SOL_IP ' + '-D CMSG_TYPE_RECVERR=IP_RECVERR ') + +readonly ipv6_args=('--ip_version=ipv6 ' + '--mtu=1520 ' + '--local_ip=fd3d:0a0b:17d6::1 ' + '--gateway_ip=fd3d:0a0b:17d6:8888::1 ' + '--remote_ip=fd3d:fa7b:d17d::1 ' + '-D CMSG_LEVEL_IP=SOL_IPV6 ' + '-D CMSG_TYPE_RECVERR=IPV6_RECVERR ') + +if [ $# -ne 1 ]; then + ktap_exit_fail_msg "usage: $0